From c840438b62e3071b8e658de7343c8e461387de97 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 30 Jul 2021 17:31:00 -0500 Subject: [PATCH 01/57] Squashed 'src/composable_kernel/' content from commit f6edda611 git-subtree-dir: src/composable_kernel git-subtree-split: f6edda6119ebbb237dfa6270797b34f960d7b190 --- .clang-format | 90 + CMakeLists.txt | 42 + README.md | 177 + cmake/AddKernels.cmake | 40 + cmake/TargetFlags.cmake | 50 + .../include/gridwise_operation_wrapper.hpp | 14 + ...volution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp | 272 + ...lution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp | 275 + ...volution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp | 263 + ...volution_into_gemm_v4r4_nhwc_kyxc_nhwk.hpp | 179 + ...lution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp | 129 + ...lution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp | 129 + ...lution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp | 132 + ...volution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp | 132 + .../tensor_description/cluster_descriptor.hpp | 33 + .../dynamic_multi_index_transform.hpp | 1737 +++++ .../dynamic_multi_index_transform_helper.hpp | 104 + .../dynamic_tensor_descriptor.hpp | 596 ++ .../dynamic_tensor_descriptor_helper.hpp | 150 + .../tensor_description/tensor_adaptor.hpp | 466 ++ ...lockwise_dynamic_tensor_slice_transfer.hpp | 171 + ...kwise_dynamic_tensor_slice_transfer_v2.hpp | 158 + .../blockwise_gemm_dlops_v2r2.hpp | 396 ++ .../blockwise_gemm_dlops_v2r3.hpp | 410 ++ .../blockwise_gemm_dlops_v3.hpp | 190 + .../blockwise_gemm_xdlops.hpp | 528 ++ ...ridwise_dynamic_contraction_dlops_v1r2.hpp | 664 ++ .../gridwise_dynamic_gemm_dlops_v1r2.hpp | 679 ++ .../gridwise_dynamic_gemm_dlops_v1r3.hpp | 671 ++ .../gridwise_dynamic_gemm_dlops_v2.hpp | 463 ++ .../gridwise_dynamic_gemm_xdlops_v2r3.hpp | 823 +++ .../threadwise_contraction_dlops.hpp | 230 + .../threadwise_dynamic_tensor_slice_set.hpp | 59 + ...readwise_dynamic_tensor_slice_transfer.hpp | 1449 +++++ ...dwise_dynamic_tensor_slice_transfer_v2.hpp | 789 +++ .../threadwise_gemm_dlops_v3.hpp | 162 + .../include/tensor_operation/xdlops_gemm.hpp | 801 +++ .../utility/amd_buffer_addressing_v2.hpp | 654 ++ .../include/utility/amd_dlop.hpp | 188 + .../include/utility/amd_inline_asm.hpp | 353 + .../include/utility/amd_llvm_intrinsic.hpp | 11 + .../include/utility/amd_xdlops.hpp | 499 ++ composable_kernel/include/utility/array.hpp | 63 + .../include/utility/array_multi_index.hpp | 77 + .../include/utility/common_header.hpp | 45 + composable_kernel/include/utility/config.hpp | 142 + .../utility/container_element_picker.hpp | 155 + .../include/utility/container_helper.hpp | 403 ++ .../include/utility/data_type.hpp | 1017 +++ .../include/utility/data_type_enum.hpp | 20 + .../include/utility/data_type_helper.hpp | 76 + .../include/utility/dynamic_buffer.hpp | 208 + .../include/utility/functional.hpp | 116 + .../include/utility/functional2.hpp | 48 + .../include/utility/functional3.hpp | 142 + .../include/utility/functional4.hpp | 62 + .../include/utility/integral_constant.hpp | 17 + .../include/utility/magic_division.hpp | 155 + composable_kernel/include/utility/math.hpp | 225 + .../include/utility/multi_index.hpp | 12 + composable_kernel/include/utility/number.hpp | 44 + composable_kernel/include/utility/print.hpp | 70 + .../include/utility/sequence.hpp | 882 +++ .../include/utility/sequence_helper.hpp | 36 + .../include/utility/static_buffer.hpp | 35 + .../utility/statically_indexed_array.hpp | 40 + .../statically_indexed_array_multi_index.hpp | 108 + .../include/utility/synchronization.hpp | 21 + 
composable_kernel/include/utility/tuple.hpp | 167 + .../include/utility/tuple_helper.hpp | 80 + composable_kernel/include/utility/type.hpp | 60 + composable_kernel/include/utility/utility.hpp | 14 + ...mplicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp | 374 ++ ...plicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp | 362 ++ ...plicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp | 362 ++ ...mplicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp | 392 ++ external/half/include/half.hpp | 5671 +++++++++++++++++ external/rocm/include/bfloat16_dev.hpp | 125 + host/CMakeLists.txt | 4 + host/driver_offline/CMakeLists.txt | 21 + .../conv_bwd_driver_offline.cpp | 357 ++ .../conv_fwd_driver_offline.cpp | 480 ++ ...plicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp | 341 + ...icit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp | 317 + ...mplicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp | 210 + ...plicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp | 283 + ...licit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp | 284 + ...icit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp | 206 + ...icit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk.hpp | 240 + ...icit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp | 305 + ...icit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp | 365 ++ ...mplicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp | 192 + ...mplicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp | 244 + .../driver_dynamic_contraction_dlops_v1r2.hpp | 290 + ...mplicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp | 352 + ..._gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp | 367 ++ .../driver_dynamic_gemm_dlops_v1r2.hpp | 415 ++ .../driver_dynamic_gemm_dlops_v1r3.hpp | 411 ++ .../driver_dynamic_gemm_xdlops_v2r3.hpp | 196 + host/driver_online/CMakeLists.txt | 21 + host/driver_online/conv_fwd_driver_online.cpp | 453 ++ ...nv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp | 673 ++ ..._tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp | 51 + ...tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp | 73 + ...tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp | 73 + .../convolution_problem_descriptor.hpp | 79 + ...mplicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp | 395 ++ ...plicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp | 386 ++ ...plicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp | 389 ++ ...mplicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp | 182 + .../include/online_driver_common.hpp | 44 + host/host_tensor/CMakeLists.txt | 19 + host/host_tensor/include/conv_common.hpp | 86 + host/host_tensor/include/device.hpp | 86 + host/host_tensor/include/device_tensor.hpp | 9 + host/host_tensor/include/host_conv.hpp | 326 + .../include/host_conv_bwd_data.hpp | 143 + host/host_tensor/include/host_tensor.hpp | 322 + .../include/host_tensor_generator.hpp | 60 + host/host_tensor/src/device.cpp | 67 + host/host_tensor/src/host_tensor.cpp | 48 + host/online_compilation/CMakeLists.txt | 168 + .../addkernels/CMakeLists.txt | 30 + .../addkernels/addkernels.cpp | 264 + .../addkernels/include_inliner.cpp | 213 + .../addkernels/include_inliner.hpp | 142 + .../addkernels/source_file_desc.hpp | 45 + .../hip_utility/binary_cache.cpp | 112 + .../hip_utility/exec_utils.cpp | 93 + .../hip_utility/handlehip.cpp | 285 + .../hip_utility/hip_build_utils.cpp | 346 + .../hip_utility/hipoc_kernel.cpp | 84 + .../hip_utility/hipoc_program.cpp | 139 + .../hip_utility/kernel_build_params.cpp | 66 + .../hip_utility/kernel_cache.cpp | 154 + .../online_compilation/hip_utility/logger.cpp | 43 + host/online_compilation/hip_utility/md5.cpp | 319 + .../hip_utility/target_properties.cpp | 119 + .../hip_utility/tmp_dir.cpp | 66 + .../include/binary_cache.hpp | 52 + host/online_compilation/include/config.h.in | 47 + host/online_compilation/include/env.hpp | 123 + 
.../online_compilation/include/exec_utils.hpp | 42 + host/online_compilation/include/handle.hpp | 145 + host/online_compilation/include/hipCheck.hpp | 22 + .../include/hip_build_utils.hpp | 97 + .../include/hipoc_kernel.hpp | 174 + .../include/hipoc_program.hpp | 64 + .../include/hipoc_program_impl.hpp | 61 + host/online_compilation/include/kernel.hpp | 45 + .../include/kernel_build_params.hpp | 137 + .../include/kernel_cache.hpp | 97 + host/online_compilation/include/logger.hpp | 23 + .../online_compilation/include/manage_ptr.hpp | 76 + host/online_compilation/include/md5.hpp | 12 + .../include/op_kernel_args.hpp | 35 + .../include/simple_hash.hpp | 44 + .../include/stringutils.hpp | 133 + .../include/target_properties.hpp | 56 + host/online_compilation/include/tmp_dir.hpp | 26 + .../online_compilation/include/write_file.hpp | 30 + host/online_compilation/kernel.cpp.in | 70 + .../online_compilation/kernel_includes.cpp.in | 80 + host/online_compilation/kernels_batch.cpp.in | 1 + script/cmake-rocm.sh | 42 + script/count_vgpr.sh | 259 + script/docker-rocm4.1.sh | 14 + script/hipclang_opt.sh | 25 + script/run.sh | 47 + 169 files changed, 41816 insertions(+) create mode 100644 .clang-format create mode 100644 CMakeLists.txt create mode 100644 README.md create mode 100644 cmake/AddKernels.cmake create mode 100644 cmake/TargetFlags.cmake create mode 100644 composable_kernel/include/gridwise_operation_wrapper.hpp create mode 100644 composable_kernel/include/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp create mode 100644 composable_kernel/include/problem_transform/transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp create mode 100644 composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp create mode 100644 composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk.hpp create mode 100644 composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp create mode 100644 composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp create mode 100644 composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp create mode 100644 composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp create mode 100644 composable_kernel/include/tensor_description/cluster_descriptor.hpp create mode 100644 composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp create mode 100644 composable_kernel/include/tensor_description/dynamic_multi_index_transform_helper.hpp create mode 100644 composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp create mode 100644 composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper.hpp create mode 100644 composable_kernel/include/tensor_description/tensor_adaptor.hpp create mode 100644 composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer.hpp create mode 100644 composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer_v2.hpp create mode 100644 composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r2.hpp create mode 100644 composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r3.hpp create mode 100644 composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp create mode 100644 
composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp create mode 100644 composable_kernel/include/tensor_operation/gridwise_dynamic_contraction_dlops_v1r2.hpp create mode 100644 composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r2.hpp create mode 100644 composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r3.hpp create mode 100644 composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v2.hpp create mode 100644 composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp create mode 100644 composable_kernel/include/tensor_operation/threadwise_contraction_dlops.hpp create mode 100644 composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_set.hpp create mode 100644 composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp create mode 100644 composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer_v2.hpp create mode 100644 composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp create mode 100644 composable_kernel/include/tensor_operation/xdlops_gemm.hpp create mode 100644 composable_kernel/include/utility/amd_buffer_addressing_v2.hpp create mode 100644 composable_kernel/include/utility/amd_dlop.hpp create mode 100644 composable_kernel/include/utility/amd_inline_asm.hpp create mode 100644 composable_kernel/include/utility/amd_llvm_intrinsic.hpp create mode 100644 composable_kernel/include/utility/amd_xdlops.hpp create mode 100644 composable_kernel/include/utility/array.hpp create mode 100644 composable_kernel/include/utility/array_multi_index.hpp create mode 100644 composable_kernel/include/utility/common_header.hpp create mode 100644 composable_kernel/include/utility/config.hpp create mode 100644 composable_kernel/include/utility/container_element_picker.hpp create mode 100644 composable_kernel/include/utility/container_helper.hpp create mode 100644 composable_kernel/include/utility/data_type.hpp create mode 100644 composable_kernel/include/utility/data_type_enum.hpp create mode 100644 composable_kernel/include/utility/data_type_helper.hpp create mode 100644 composable_kernel/include/utility/dynamic_buffer.hpp create mode 100644 composable_kernel/include/utility/functional.hpp create mode 100644 composable_kernel/include/utility/functional2.hpp create mode 100644 composable_kernel/include/utility/functional3.hpp create mode 100644 composable_kernel/include/utility/functional4.hpp create mode 100644 composable_kernel/include/utility/integral_constant.hpp create mode 100644 composable_kernel/include/utility/magic_division.hpp create mode 100644 composable_kernel/include/utility/math.hpp create mode 100644 composable_kernel/include/utility/multi_index.hpp create mode 100644 composable_kernel/include/utility/number.hpp create mode 100644 composable_kernel/include/utility/print.hpp create mode 100644 composable_kernel/include/utility/sequence.hpp create mode 100644 composable_kernel/include/utility/sequence_helper.hpp create mode 100644 composable_kernel/include/utility/static_buffer.hpp create mode 100644 composable_kernel/include/utility/statically_indexed_array.hpp create mode 100644 composable_kernel/include/utility/statically_indexed_array_multi_index.hpp create mode 100644 composable_kernel/include/utility/synchronization.hpp create mode 100644 composable_kernel/include/utility/tuple.hpp create mode 100644 composable_kernel/include/utility/tuple_helper.hpp create mode 100644 
composable_kernel/include/utility/type.hpp create mode 100644 composable_kernel/include/utility/utility.hpp create mode 100644 composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp create mode 100644 composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp create mode 100644 composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp create mode 100644 composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp create mode 100644 external/half/include/half.hpp create mode 100644 external/rocm/include/bfloat16_dev.hpp create mode 100644 host/CMakeLists.txt create mode 100644 host/driver_offline/CMakeLists.txt create mode 100644 host/driver_offline/conv_bwd_driver_offline.cpp create mode 100644 host/driver_offline/conv_fwd_driver_offline.cpp create mode 100644 host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp create mode 100644 host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp create mode 100644 host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp create mode 100644 host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp create mode 100644 host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp create mode 100644 host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp create mode 100644 host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk.hpp create mode 100644 host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp create mode 100644 host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp create mode 100644 host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp create mode 100644 host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp create mode 100644 host/driver_offline/include/driver_dynamic_contraction_dlops_v1r2.hpp create mode 100644 host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp create mode 100644 host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp create mode 100644 host/driver_offline/include/driver_dynamic_gemm_dlops_v1r2.hpp create mode 100644 host/driver_offline/include/driver_dynamic_gemm_dlops_v1r3.hpp create mode 100644 host/driver_offline/include/driver_dynamic_gemm_xdlops_v2r3.hpp create mode 100644 host/driver_online/CMakeLists.txt create mode 100644 host/driver_online/conv_fwd_driver_online.cpp create mode 100644 host/driver_online/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp create mode 100644 host/driver_online/include/conv_tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp create mode 100644 host/driver_online/include/conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp create mode 100644 host/driver_online/include/conv_tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp create mode 100644 host/driver_online/include/convolution_problem_descriptor.hpp create mode 
100644 host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp create mode 100644 host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp create mode 100644 host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp create mode 100644 host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp create mode 100644 host/driver_online/include/online_driver_common.hpp create mode 100644 host/host_tensor/CMakeLists.txt create mode 100644 host/host_tensor/include/conv_common.hpp create mode 100644 host/host_tensor/include/device.hpp create mode 100644 host/host_tensor/include/device_tensor.hpp create mode 100644 host/host_tensor/include/host_conv.hpp create mode 100644 host/host_tensor/include/host_conv_bwd_data.hpp create mode 100644 host/host_tensor/include/host_tensor.hpp create mode 100644 host/host_tensor/include/host_tensor_generator.hpp create mode 100644 host/host_tensor/src/device.cpp create mode 100644 host/host_tensor/src/host_tensor.cpp create mode 100644 host/online_compilation/CMakeLists.txt create mode 100644 host/online_compilation/addkernels/CMakeLists.txt create mode 100644 host/online_compilation/addkernels/addkernels.cpp create mode 100644 host/online_compilation/addkernels/include_inliner.cpp create mode 100644 host/online_compilation/addkernels/include_inliner.hpp create mode 100644 host/online_compilation/addkernels/source_file_desc.hpp create mode 100644 host/online_compilation/hip_utility/binary_cache.cpp create mode 100644 host/online_compilation/hip_utility/exec_utils.cpp create mode 100644 host/online_compilation/hip_utility/handlehip.cpp create mode 100644 host/online_compilation/hip_utility/hip_build_utils.cpp create mode 100644 host/online_compilation/hip_utility/hipoc_kernel.cpp create mode 100644 host/online_compilation/hip_utility/hipoc_program.cpp create mode 100644 host/online_compilation/hip_utility/kernel_build_params.cpp create mode 100644 host/online_compilation/hip_utility/kernel_cache.cpp create mode 100644 host/online_compilation/hip_utility/logger.cpp create mode 100644 host/online_compilation/hip_utility/md5.cpp create mode 100644 host/online_compilation/hip_utility/target_properties.cpp create mode 100644 host/online_compilation/hip_utility/tmp_dir.cpp create mode 100644 host/online_compilation/include/binary_cache.hpp create mode 100644 host/online_compilation/include/config.h.in create mode 100644 host/online_compilation/include/env.hpp create mode 100644 host/online_compilation/include/exec_utils.hpp create mode 100644 host/online_compilation/include/handle.hpp create mode 100644 host/online_compilation/include/hipCheck.hpp create mode 100644 host/online_compilation/include/hip_build_utils.hpp create mode 100644 host/online_compilation/include/hipoc_kernel.hpp create mode 100644 host/online_compilation/include/hipoc_program.hpp create mode 100644 host/online_compilation/include/hipoc_program_impl.hpp create mode 100644 host/online_compilation/include/kernel.hpp create mode 100644 host/online_compilation/include/kernel_build_params.hpp create mode 100644 host/online_compilation/include/kernel_cache.hpp create mode 100644 host/online_compilation/include/logger.hpp create mode 100644 host/online_compilation/include/manage_ptr.hpp create mode 100644 host/online_compilation/include/md5.hpp create mode 100644 
host/online_compilation/include/op_kernel_args.hpp create mode 100644 host/online_compilation/include/simple_hash.hpp create mode 100644 host/online_compilation/include/stringutils.hpp create mode 100644 host/online_compilation/include/target_properties.hpp create mode 100644 host/online_compilation/include/tmp_dir.hpp create mode 100644 host/online_compilation/include/write_file.hpp create mode 100644 host/online_compilation/kernel.cpp.in create mode 100644 host/online_compilation/kernel_includes.cpp.in create mode 100644 host/online_compilation/kernels_batch.cpp.in create mode 100755 script/cmake-rocm.sh create mode 100755 script/count_vgpr.sh create mode 100755 script/docker-rocm4.1.sh create mode 100755 script/hipclang_opt.sh create mode 100755 script/run.sh diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000000..22f2674966 --- /dev/null +++ b/.clang-format @@ -0,0 +1,90 @@ +--- +Language: Cpp +AccessModifierOffset: 0 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: true +AlignConsecutiveDeclarations: false +AlignEscapedNewlinesLeft: true +AlignOperands: true +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: true +AllowShortCaseLabelsOnASingleLine: true +AllowShortFunctionsOnASingleLine: All +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: true +BinPackArguments: false +BinPackParameters: false +BraceWrapping: + AfterClass: true + AfterControlStatement: true + AfterEnum: true + AfterFunction: true + AfterNamespace: false + AfterObjCDeclaration: true + AfterStruct: true + AfterUnion: true + BeforeCatch: true + BeforeElse: true + IndentBraces: false +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Custom +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +ColumnLimit: 100 +CommentPragmas: '^ IWYU pragma:' +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + - Regex: '^(<|"(gtest|isl|json)/)' + Priority: 3 + - Regex: '.*' + Priority: 1 +IndentCaseLabels: false +IndentWidth: 4 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: true +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PointerAlignment: Left +ReflowComments: true +SortIncludes: false +SpaceAfterCStyleCast: false +# SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: Never +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp11 +TabWidth: 8 +UseTab: Never +... 
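The `.clang-format` above pins the project style: Allman-style braces for classes, structs, and functions but not namespaces, 4-space indentation, a 100-column limit, left-aligned pointers, aligned consecutive assignments, and template declarations always broken onto their own line. As a quick illustration only (hypothetical example code, not part of the patch), a small C++ fragment laid out the way these settings format it:
```
// Illustration of the .clang-format settings above (hypothetical example code).
namespace ck {                // AfterNamespace: false, NamespaceIndentation: None
template <typename T>         // AlwaysBreakTemplateDeclarations: true
struct ExampleBuffer
{                             // BraceWrapping.AfterStruct: true
    T* data   = nullptr;      // PointerAlignment: Left
    int count = 0;            // AlignConsecutiveAssignments: true

    void Reset()
    {                         // BraceWrapping.AfterFunction: true
        data  = nullptr;
        count = 0;
    }
};
} // namespace ck
```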
+ diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000000..0cf342bb45 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,42 @@ +cmake_minimum_required(VERSION 2.8.3) +project(modular_convolution) + +list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") + +include(TargetFlags) +include(AddKernels) + +## C++ +enable_language(CXX) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) +message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}") + +## OpenMP +if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") + # workaround: hipcc in ROCm 3.5 cannot find OpenMP + set(OpenMP_CXX "${CMAKE_CXX_COMPILER}") + set(OpenMP_CXX_FLAGS "-fopenmp=libomp -Wno-unused-command-line-argument") + set(OpenMP_CXX_LIB_NAMES "libomp" "libgomp" "libiomp5") + set(OpenMP_libomp_LIBRARY ${OpenMP_CXX_LIB_NAMES}) + set(OpenMP_libgomp_LIBRARY ${OpenMP_CXX_LIB_NAMES}) + set(OpenMP_libiomp5_LIBRARY ${OpenMP_CXX_LIB_NAMES}) +else() + find_package(OpenMP REQUIRED) +endif() + +message("OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}") +message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}") +message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}") +message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}") + +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +link_libraries(${OpenMP_gomp_LIBRARY}) +link_libraries(${OpenMP_pthread_LIBRARY}) + +## HIP +find_package(HIP REQUIRED) +message(STATUS "Build with HIP ${hip_VERSION}") + +add_subdirectory(host) diff --git a/README.md b/README.md new file mode 100644 index 0000000000..6e6019601a --- /dev/null +++ b/README.md @@ -0,0 +1,177 @@ +# How to build and run + +# Docker +``` +docker run \ +-it \ +--rm \ +--privileged \ +--group-add sudo \ +-w /root/workspace \ +-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ +rocm/tensorflow:rocm4.2-tf2.4-dev \ +/bin/bash +``` + +# Install Boost for online compilation +https://www.boost.org/doc/libs/1_66_0/more/getting_started/unix-variants.html#easy-build-and-install + + +# Build +Add the Boost library path +``` + export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH +``` + +``` +mkdir build && cd build +``` + +CMake command. The GPU target ID must be specified; the example below uses gfx908 +``` +cmake \ +-D CMAKE_BUILD_TYPE=Release \ +-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 -O3 --amdgpu-target=gfx908 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD" \ +-D HIP_ONLINE_COMPILER_FLAGS="-DCK_AMD_GPU_GFX908" \ +-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ +-D CMAKE_PREFIX_PATH=/opt/rocm \ +-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ +.. +``` + +Build the drivers: \ +``conv_fwd_driver_offline`` is the (offline compilation) driver for forward convolution, \ +``conv_bwd_driver_offline`` is the (offline compilation) driver for backward-data convolution \ +``conv_fwd_driver_online`` is the (online compilation) driver for forward convolution +``` + make -j conv_fwd_driver_offline + make -j conv_bwd_driver_offline + make -j conv_fwd_driver_online +``` + +# Run +* layout: 0 = NCHW; 1 = NHWC +* algo: algorithm +* verify: 0 = no verification; 1 = do verification +* init: 0 ~ 5.
initialization method +* log: 0 = no log; 1 = do log +* repeat: number of times the kernel is launched +``` +######################################################## layout algo verify init log repeat N__ K___ C___ Y X Hi_ Wi__ Strides Dilations LeftPads RightPads + ./host/driver_offline/conv_fwd_driver_offline 0 4 0 0 0 1 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1 + ./host/driver_offline/conv_fwd_driver_offline 0 4 0 0 0 1 256 1024 256 3 3 14 14 1 1 1 1 1 1 1 1 + ./host/driver_offline/conv_fwd_driver_offline 1 5 0 0 0 1 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1 + ./host/driver_offline/conv_fwd_driver_offline 1 5 0 0 0 1 256 1024 256 3 3 14 14 1 1 1 1 1 1 1 1 + ./host/driver_offline/conv_bwd_driver_offline 1 5 0 0 0 1 256 256 1024 3 3 14 14 1 1 1 1 1 1 1 1 +``` + +# Result +Forward convolution, FP16, NCHW +``` +./host/driver_offline/conv_fwd_driver_offline 0 4 0 0 0 1 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1 + +layout: 0 +in: dim 4, lengths {128, 192, 71, 71}, strides {967872, 5041, 71, 1} +wei: dim 4, lengths {256, 192, 3, 3}, strides {1728, 9, 3, 1} +out: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1296, 36, 1} +InLeftPads size 2, {1, 1, } +InRightPads size 2, {1, 1, } +ConvStrides size 2, {2, 2, } +ConvDilations size 2, {1, 1, } +device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw +a_k0_m_k1_grid_desc{216, 256, 8} +b_k0_n_k1_grid_desc{216, 165888, 8} +c_m_n_grid_desc{ 256, 165888} +launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1} +Warm up +Start running 1 times... +Average time : 1.4155 ms, 103.686 TFlop/s +``` + +Forward convolution, FP16, NCHW +``` + ./host/driver_offline/conv_fwd_driver_offline 0 4 0 0 0 1 256 1024 256 3 3 14 14 1 1 1 1 1 1 1 1 + + layout: 0 +in: dim 4, lengths {256, 256, 14, 14}, strides {50176, 196, 14, 1} +wei: dim 4, lengths {1024, 256, 3, 3}, strides {2304, 9, 3, 1} +out: dim 4, lengths {256, 1024, 14, 14}, strides {200704, 196, 14, 1} +InLeftPads size 2, {1, 1, } +InRightPads size 2, {1, 1, } +ConvStrides size 2, {1, 1, } +ConvDilations size 2, {1, 1, } +device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw +a_k0_m_k1_grid_desc{288, 1024, 8} +b_k0_n_k1_grid_desc{288, 50176, 8} +c_m_n_grid_desc{ 1024, 50176} +launch_and_time_kernel: grid_dim {1568, 1, 1}, block_dim {256, 1, 1} +Warm up +Start running 1 times... +Average time : 2.21357 ms, 106.959 TFlop/s + ``` + + Forward convolution, FP16, NHWC + ``` + ./host/driver_offline/conv_fwd_driver_offline 1 5 0 0 0 1 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1 + + layout: 1 +in: dim 4, lengths {128, 71, 71, 192}, strides {967872, 13632, 192, 1} +wei: dim 4, lengths {256, 3, 3, 192}, strides {1728, 576, 192, 1} +out: dim 4, lengths {128, 36, 36, 256}, strides {331776, 9216, 256, 1} +InLeftPads size 2, {1, 1, } +InRightPads size 2, {1, 1, } +ConvStrides size 2, {2, 2, } +ConvDilations size 2, {1, 1, } +device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk +a_k0_m_k1_grid_desc{216, 165888, 8} +b_k0_n_k1_grid_desc{216, 256, 8} +c_m_n_grid_desc{ 165888, 256} +launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1} +Warm up +Start running 1 times...
+Average time : 1.12014 ms, 131.025 TFlop/s + ``` + + Forward convolution, FP16, NHWC + ``` + ./host/driver_offline/conv_fwd_driver_offline 1 5 0 0 0 1 256 1024 256 3 3 14 14 1 1 1 1 1 1 1 1 + + layout: 1 +in: dim 4, lengths {256, 14, 14, 256}, strides {50176, 3584, 256, 1} +wei: dim 4, lengths {1024, 3, 3, 256}, strides {2304, 768, 256, 1} +out: dim 4, lengths {256, 14, 14, 1024}, strides {200704, 14336, 1024, 1} +InLeftPads size 2, {1, 1, } +InRightPads size 2, {1, 1, } +ConvStrides size 2, {1, 1, } +ConvDilations size 2, {1, 1, } +device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk +a_k0_m_k1_grid_desc{288, 50176, 8} +b_k0_n_k1_grid_desc{288, 1024, 8} +c_m_n_grid_desc{ 50176, 1024} +launch_and_time_kernel: grid_dim {1568, 1, 1}, block_dim {256, 1, 1} +Warm up +Start running 1 times... +Average time : 1.86877 ms, 126.693 TFlop/s + ``` + + Backward data convolution, FP16, NHWC + ``` + ./host/driver_offline/conv_bwd_driver_offline 1 1 0 3 0 1 256 256 1024 3 3 14 14 1 1 1 1 1 1 1 1 + + layout: 1 +in: dim 4, lengths {256, 14, 14, 1024}, strides {200704, 14336, 1024, 1} +wei: dim 4, lengths {256, 3, 3, 1024}, strides {9216, 3072, 1024, 1} +out: dim 4, lengths {256, 14, 14, 256}, strides {50176, 3584, 256, 1} +InLeftPads size 2, {1, 1, } +InRightPads size 2, {1, 1, } +ConvStrides size 2, {1, 1, } +ConvDilations size 2, {1, 1, } +device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk +a_k0_m_k1_grid_desc{288, 50176, 8} +b_k0_n_k1_grid_desc{288, 1024, 8} +c_m_n_grid_desc{ 50176, 1024} +launch_and_time_kernel: grid_dim {1568, 1, 1}, block_dim {256, 1, 1} +Warm up +Start running 1 times... +Average time : 2.22461 ms, 106.428 TFlop/s +``` diff --git a/cmake/AddKernels.cmake b/cmake/AddKernels.cmake new file mode 100644 index 0000000000..429ecc47a9 --- /dev/null +++ b/cmake/AddKernels.cmake @@ -0,0 +1,40 @@ + +function(add_kernels SRC_DIR KERNEL_FILES) + set(INIT_KERNELS_LIST) + set(KERNELS_DECLS) + foreach(KERNEL_FILE ${KERNEL_FILES}) + if("${CMAKE_VERSION}" VERSION_LESS 3.0) + configure_file(${KERNEL_FILE} ${KERNEL_FILE}.delete) + else() + set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${KERNEL_FILE}) + endif() + get_filename_component(BASE_NAME ${KERNEL_FILE} NAME_WE) + string(TOUPPER "${BASE_NAME}" KEY_NAME) + string(MAKE_C_IDENTIFIER "${KEY_NAME}" VAR_NAME) + string(APPEND KERNELS_DECLS "extern const size_t APP_KERNEL_${VAR_NAME}_SIZE;\n") + string(APPEND KERNELS_DECLS "extern const unsigned char APP_KERNEL_${VAR_NAME}[];\n") + list(APPEND INIT_KERNELS_LIST " { \"${KEY_NAME}\", std::string(reinterpret_cast(APP_KERNEL_${VAR_NAME}), APP_KERNEL_${VAR_NAME}_SIZE) }") + endforeach() + string(REPLACE ";" ",\n" INIT_KERNELS "${INIT_KERNELS_LIST}") + configure_file(${SRC_DIR}/kernel.cpp.in ${PROJECT_BINARY_DIR}/kernel.cpp) +endfunction() + +function(add_kernel_includes SRC_DIR KERNEL_FILES) + set(INIT_KERNELS_LIST) + foreach(KERNEL_FILE ${KERNEL_FILES}) + if("${CMAKE_VERSION}" VERSION_LESS 3.0) + configure_file(${KERNEL_FILE} ${KERNEL_FILE}.delete) + else() + set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${KERNEL_FILE}) + endif() + get_filename_component(BASE_NAME ${KERNEL_FILE} NAME_WE) + get_filename_component(FILE_NAME ${KERNEL_FILE} NAME) + string(TOUPPER "${BASE_NAME}" KEY_NAME) + string(MAKE_C_IDENTIFIER "${KEY_NAME}" VAR_NAME) + list(APPEND INIT_KERNELS_LIST " { \"${FILE_NAME}\", std::string(reinterpret_cast(${VAR_NAME}), ${VAR_NAME}_SIZE) }") + endforeach() + string(REPLACE ";" ",\n" INIT_KERNELS 
"${INIT_KERNELS_LIST}") + configure_file(${SRC_DIR}/kernel_includes.cpp.in ${PROJECT_BINARY_DIR}/kernel_includes.cpp) +endfunction() + + diff --git a/cmake/TargetFlags.cmake b/cmake/TargetFlags.cmake new file mode 100644 index 0000000000..4f83fb5d39 --- /dev/null +++ b/cmake/TargetFlags.cmake @@ -0,0 +1,50 @@ + +function(get_target_property2 VAR TARGET PROPERTY) + get_target_property(_pflags ${TARGET} ${PROPERTY}) + if(_pflags) + set(${VAR} ${_pflags} PARENT_SCOPE) + else() + set(${VAR} "" PARENT_SCOPE) + endif() +endfunction() + + +macro(append_flags FLAGS TARGET PROPERTY PREFIX) + get_target_property2(_pflags ${TARGET} ${PROPERTY}) + foreach(FLAG ${_pflags}) + if(TARGET ${FLAG}) + target_flags(_pflags2 ${FLAG}) + string(APPEND ${FLAGS} " ${_pflags2}") + else() + string(APPEND ${FLAGS} " ${PREFIX}${FLAG}") + endif() + endforeach() +endmacro() + +macro(append_link_flags FLAGS TARGET PROPERTY) + get_target_property2(_pflags ${TARGET} ${PROPERTY}) + foreach(FLAG ${_pflags}) + if(TARGET ${FLAG}) + target_flags(_pflags2 ${FLAG}) + string(APPEND ${FLAGS} " ${_pflags2}") + elseif(FLAG MATCHES "^-.*") + string(APPEND ${FLAGS} " ${FLAG}") + elseif(EXISTS ${FLAG}) + string(APPEND ${FLAGS} " ${FLAG}") + else() + string(APPEND ${FLAGS} " -l${FLAG}") + endif() + endforeach() +endmacro() + +function(target_flags FLAGS TARGET) + set(_flags) + append_flags(_flags ${TARGET} "INTERFACE_COMPILE_OPTIONS" "") + append_flags(_flags ${TARGET} "INTERFACE_COMPILE_DEFINITIONS" "-D") + append_flags(_flags ${TARGET} "INTERFACE_INCLUDE_DIRECTORIES" "-isystem ") + append_flags(_flags ${TARGET} "INTERFACE_LINK_DIRECTORIES" "-L ") + append_flags(_flags ${TARGET} "INTERFACE_LINK_OPTIONS" "") + append_link_flags(_flags ${TARGET} "INTERFACE_LINK_LIBRARIES" "") + # message("_flags: ${_flags}") + set(${FLAGS} ${_flags} PARENT_SCOPE) +endfunction() diff --git a/composable_kernel/include/gridwise_operation_wrapper.hpp b/composable_kernel/include/gridwise_operation_wrapper.hpp new file mode 100644 index 0000000000..0a1e07ec57 --- /dev/null +++ b/composable_kernel/include/gridwise_operation_wrapper.hpp @@ -0,0 +1,14 @@ +#ifndef CK_GRIDWISE_OPERATION_KERNEL_WRAPPER +#define CK_GRIDWISE_OPERATION_KERNEL_WRAPPER + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + run_gridwise_operation(Xs... 
xs) +{ + GridwiseOp{}.Run(xs...); +} + +#endif diff --git a/composable_kernel/include/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp b/composable_kernel/include/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp new file mode 100644 index 0000000000..5c582dea46 --- /dev/null +++ b/composable_kernel/include/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,272 @@ +#ifndef CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1_NHWC_KYXC_NHWK_HPP +#define CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1_NHWC_KYXC_NHWK_HPP + +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" + +namespace ck { + +// Number of GEMMs = YTilda * XTilda +// GemmM = C +// GemmN = N * HTildaSlice * WTildaSlice +// GemmK = K * YDotSlice * XDotSlice +template +__host__ __device__ constexpr auto +transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk( + const DynamicTensorDescriptor& wei_k_y_x_c_grid_desc, + const DynamicTensorDescriptor& out_n_ho_wo_k_grid_desc, + const DynamicTensorDescriptor& in_n_hi_wi_c_grid_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + Number, + Number, + Number) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto GemmK1 = Number{}; + constexpr auto IYTilda = Number{}; + constexpr auto IXTilda = Number{}; + + const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0); + const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3); + const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3); + + const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1); + const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2); + + const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1); + const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2); + + const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1); + const auto X = wei_k_y_x_c_grid_desc.GetLength(I2); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); + const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + const auto YTilda = ConvStrideH / GcdStrideDilationH; + const auto XTilda = ConvStrideW / GcdStrideDilationW; + + const auto YDot = math::integer_divide_ceil(Y, YTilda); + const auto XDot = math::integer_divide_ceil(X, XTilda); + + const auto HTilda = Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH); + const auto WTilda = Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW); + + // only work on HTilda and WTilda that contribute to non-padding area of input tensor + const auto IHTildaSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadH - ConvDilationH * (YTilda - I1)), ConvStrideH); + const auto IWTildaSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadW - ConvDilationW * (XTilda - I1)), ConvStrideW); + + const auto IHTildaSliceEnd = + math::min(HTilda, math::integer_divide_ceil(InLeftPadH + Hi 
- I1, ConvStrideH) + I1); + const auto IWTildaSliceEnd = + math::min(WTilda, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1); + + const auto HTildaSlice = IHTildaSliceEnd - IHTildaSliceBegin; + const auto WTildaSlice = IWTildaSliceEnd - IWTildaSliceBegin; + + // GemmK is different for each GEMM + const auto YDotSlice = math::integer_divide_ceil(Y - IYTilda, YTilda); + const auto XDotSlice = math::integer_divide_ceil(X - IXTilda, XTilda); + + const auto K1 = GemmK1; + const auto K0 = K / K1; + + // weight tensor + const auto wei_k_ydot_ytilda_xdot_xtilda_c_grid_desc = transform_dynamic_tensor_descriptor( + wei_k_y_x_c_grid_desc, + make_tuple(make_pass_through_transform(K), + make_embed_transform(make_tuple(YDot, YTilda), + make_tuple(ConvStrideH / GcdStrideDilationH, I1)), + make_embed_transform(make_tuple(XDot, XTilda), + make_tuple(ConvStrideW / GcdStrideDilationW, I1)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto wei_k0_k1_ydotslice_xdotslice_c_grid_desc = + transform_dynamic_tensor_descriptor(wei_k_ydot_ytilda_xdot_xtilda_c_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_slice_transform(YDot, I0, YDotSlice), + make_slice_transform(XDot, I0, XDotSlice), + make_freeze_transform(IYTilda), + make_freeze_transform(IXTilda), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<3>{}, + Sequence<2>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0, 1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<>{}, + Sequence<>{}, + Sequence<4>{})); + +#if 1 + const auto wei_gemmk0_gemmm_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( + wei_k0_k1_ydotslice_xdotslice_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)), + make_pass_through_transform(C), + make_pass_through_transform(K1)), + make_tuple(Sequence<2, 3, 0>{}, Sequence<4>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); +#else + const auto wei_gemmk0_gemmm_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( + wei_k0_k1_ydotslice_xdotslice_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)), + make_pass_through_transform(C), + make_pass_through_transform(K1)), + make_tuple(Sequence<0, 2, 3>{}, Sequence<4>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); +#endif + + // output tensor + // this add padding check + const auto out_n_hop_wop_k_grid_desc = transform_dynamic_tensor_descriptor( + out_n_ho_wo_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Ho, I0, I0), + make_pad_transform(Wo, I0, I0), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto out_n_ydot_htilda_xdot_wtilda_k_grid_desc = transform_dynamic_tensor_descriptor( + out_n_hop_wop_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(YDot, HTilda), + make_tuple(-ConvDilationH / GcdStrideDilationH, I1)), + make_embed_transform(make_tuple(XDot, WTilda), + make_tuple(-ConvDilationW / GcdStrideDilationW, I1)), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, 
Sequence<5>{})); + + const auto out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc = + transform_dynamic_tensor_descriptor( + out_n_ydot_htilda_xdot_wtilda_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_slice_transform(YDot, I0, YDotSlice), + make_slice_transform(HTilda, IHTildaSliceBegin, HTildaSlice), + make_slice_transform(XDot, I0, XDotSlice), + make_slice_transform(WTilda, IWTildaSliceBegin, WTildaSlice), + make_unmerge_transform(make_tuple(K0, K1))), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5, 6>{})); + +#if 1 + const auto out_gemmk0_gemmn_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( + out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc, + make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)), + make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice)), + make_pass_through_transform(K1)), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}, Sequence<6>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); +#else + const auto out_gemmk0_gemmn_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( + out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc, + make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)), + make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice)), + make_pass_through_transform(K1)), + make_tuple(Sequence<5, 1, 3>{}, Sequence<0, 2, 4>{}, Sequence<6>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); +#endif + + // input tensor + const auto in_n_hip_wip_c_grid_desc = transform_dynamic_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_ytilda_htilda_xtilda_wtilda_c_grid_desc = transform_dynamic_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(YTilda, HTilda), + make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(XTilda, WTilda), + make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_n_htildaslice_wtildaslice_c_grid_desc = transform_dynamic_tensor_descriptor( + in_n_ytilda_htilda_xtilda_wtilda_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_freeze_transform(IYTilda), + make_slice_transform(HTilda, IHTildaSliceBegin, HTildaSlice), + make_freeze_transform(IXTilda), + make_slice_transform(WTilda, IWTildaSliceBegin, WTildaSlice), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0>{}, + Sequence<>{}, + Sequence<1>{}, + Sequence<>{}, + Sequence<2>{}, + Sequence<3>{})); + + const auto in_gemmm_gemmn_grid_desc = transform_dynamic_tensor_descriptor( + in_n_htildaslice_wtildaslice_c_grid_desc, + make_tuple(make_pass_through_transform(C), + make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice))), 
+ make_tuple(Sequence<3>{}, Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple(wei_gemmk0_gemmm_gemmk1_grid_desc, + out_gemmk0_gemmn_gemmk1_grid_desc, + in_gemmm_gemmn_grid_desc); +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/problem_transform/transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp b/composable_kernel/include/problem_transform/transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp new file mode 100644 index 0000000000..377a1ac29b --- /dev/null +++ b/composable_kernel/include/problem_transform/transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,275 @@ +#ifndef CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1R2_NHWC_KYXC_NHWK_HPP +#define CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1R2_NHWC_KYXC_NHWK_HPP + +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" + +namespace ck { + +// A: out +// B: wei +// C: in +// Number of GEMMs = YTilda * XTilda +// GemmM = N * HTildaSlice * WTildaSlice +// GemmN = C +// GemmK = K * YDotSlice * XDotSlice +template +__host__ __device__ constexpr auto +transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk( + const DynamicTensorDescriptor& out_n_ho_wo_k_grid_desc, + const DynamicTensorDescriptor& wei_k_y_x_c_grid_desc, + const DynamicTensorDescriptor& in_n_hi_wi_c_grid_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + Number, + Number, + Number) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto GemmK1 = Number{}; + constexpr auto IYTilda = Number{}; + constexpr auto IXTilda = Number{}; + + const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0); + const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3); + const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3); + + const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1); + const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2); + + const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1); + const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2); + + const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1); + const auto X = wei_k_y_x_c_grid_desc.GetLength(I2); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); + const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + const auto YTilda = ConvStrideH / GcdStrideDilationH; + const auto XTilda = ConvStrideW / GcdStrideDilationW; + + const auto YDot = math::integer_divide_ceil(Y, YTilda); + const auto XDot = math::integer_divide_ceil(X, XTilda); + + const auto HTilda = Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH); + const auto WTilda = Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW); + + // only work on HTilda and WTilda that contribute to non-padding area of input tensor + const auto IHTildaSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadH - ConvDilationH * (YTilda - 
I1)), ConvStrideH); + const auto IWTildaSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadW - ConvDilationW * (XTilda - I1)), ConvStrideW); + + const auto IHTildaSliceEnd = + math::min(HTilda, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1); + const auto IWTildaSliceEnd = + math::min(WTilda, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1); + + const auto HTildaSlice = IHTildaSliceEnd - IHTildaSliceBegin; + const auto WTildaSlice = IWTildaSliceEnd - IWTildaSliceBegin; + + // GemmK is different for each GEMM + const auto YDotSlice = math::integer_divide_ceil(Y - IYTilda, YTilda); + const auto XDotSlice = math::integer_divide_ceil(X - IXTilda, XTilda); + + const auto K1 = GemmK1; + const auto K0 = K / K1; + + // A: output tensor + // this add padding check + const auto out_n_hop_wop_k_grid_desc = transform_dynamic_tensor_descriptor( + out_n_ho_wo_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Ho, I0, I0), + make_pad_transform(Wo, I0, I0), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto out_n_ydot_htilda_xdot_wtilda_k_grid_desc = transform_dynamic_tensor_descriptor( + out_n_hop_wop_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(YDot, HTilda), + make_tuple(-ConvDilationH / GcdStrideDilationH, I1)), + make_embed_transform(make_tuple(XDot, WTilda), + make_tuple(-ConvDilationW / GcdStrideDilationW, I1)), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc = + transform_dynamic_tensor_descriptor( + out_n_ydot_htilda_xdot_wtilda_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_slice_transform(YDot, I0, YDotSlice), + make_slice_transform(HTilda, IHTildaSliceBegin, HTildaSlice), + make_slice_transform(XDot, I0, XDotSlice), + make_slice_transform(WTilda, IWTildaSliceBegin, WTildaSlice), + make_unmerge_transform(make_tuple(K0, K1))), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5, 6>{})); + +#if 1 + const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( + out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc, + make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)), + make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice)), + make_pass_through_transform(K1)), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}, Sequence<6>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); +#else + const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( + out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc, + make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)), + make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice)), + make_pass_through_transform(K1)), + make_tuple(Sequence<5, 1, 3>{}, Sequence<0, 2, 4>{}, Sequence<6>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); +#endif + + // B: weight tensor + const auto wei_k_ydot_ytilda_xdot_xtilda_c_grid_desc = 
transform_dynamic_tensor_descriptor( + wei_k_y_x_c_grid_desc, + make_tuple(make_pass_through_transform(K), + make_embed_transform(make_tuple(YDot, YTilda), + make_tuple(ConvStrideH / GcdStrideDilationH, I1)), + make_embed_transform(make_tuple(XDot, XTilda), + make_tuple(ConvStrideW / GcdStrideDilationW, I1)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto wei_k0_k1_ydotslice_xdotslice_c_grid_desc = + transform_dynamic_tensor_descriptor(wei_k_ydot_ytilda_xdot_xtilda_c_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_slice_transform(YDot, I0, YDotSlice), + make_slice_transform(XDot, I0, XDotSlice), + make_freeze_transform(IYTilda), + make_freeze_transform(IXTilda), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<3>{}, + Sequence<2>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0, 1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<>{}, + Sequence<>{}, + Sequence<4>{})); + +#if 1 + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( + wei_k0_k1_ydotslice_xdotslice_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)), + make_pass_through_transform(C), + make_pass_through_transform(K1)), + make_tuple(Sequence<2, 3, 0>{}, Sequence<4>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); +#else + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( + wei_k0_k1_ydotslice_xdotslice_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)), + make_pass_through_transform(C), + make_pass_through_transform(K1)), + make_tuple(Sequence<0, 2, 3>{}, Sequence<4>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); +#endif + + // C: input tensor + const auto in_n_hip_wip_c_grid_desc = transform_dynamic_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_ytilda_htilda_xtilda_wtilda_c_grid_desc = transform_dynamic_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(YTilda, HTilda), + make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(XTilda, WTilda), + make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_n_htildaslice_wtildaslice_c_grid_desc = transform_dynamic_tensor_descriptor( + in_n_ytilda_htilda_xtilda_wtilda_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_freeze_transform(IYTilda), + make_slice_transform(HTilda, IHTildaSliceBegin, HTildaSlice), + make_freeze_transform(IXTilda), + make_slice_transform(WTilda, IWTildaSliceBegin, WTildaSlice), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0>{}, + Sequence<>{}, + Sequence<1>{}, + 
Sequence<>{}, + Sequence<2>{}, + Sequence<3>{})); + + const auto in_gemmm_gemmn_grid_desc = transform_dynamic_tensor_descriptor( + in_n_htildaslice_wtildaslice_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice)), + make_pass_through_transform(C)), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple(out_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + in_gemmm_gemmn_grid_desc); +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp new file mode 100644 index 0000000000..404129365f --- /dev/null +++ b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp @@ -0,0 +1,263 @@ +#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4_NCHW_KCYX_NKHW_HPP +#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4_NCHW_KCYX_NKHW_HPP + +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" + +namespace ck { + +// GemmM = K +// GemmN = N * Ho * Wo +// GemmK = C * Y * X +template +__host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad( + const DynamicTensorDescriptor& wei_k_c_y_x_global_desc, + const DynamicTensorDescriptor& in_n_c_hi_wi_global_desc, + const DynamicTensorDescriptor& out_n_k_ho_wo_global_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const auto N = in_n_c_hi_wi_global_desc.GetLength(I0); + const auto C = in_n_c_hi_wi_global_desc.GetLength(I1); + const auto K = out_n_k_ho_wo_global_desc.GetLength(I1); + + const auto Hi = in_n_c_hi_wi_global_desc.GetLength(I2); + const auto Wi = in_n_c_hi_wi_global_desc.GetLength(I3); + + const auto Ho = out_n_k_ho_wo_global_desc.GetLength(I2); + const auto Wo = out_n_k_ho_wo_global_desc.GetLength(I3); + + const auto Y = wei_k_c_y_x_global_desc.GetLength(I2); + const auto X = wei_k_c_y_x_global_desc.GetLength(I3); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + // weight tensor + const auto wei_gemmk_gemmm_global_desc = transform_dynamic_tensor_descriptor( + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C * Y * X)), + make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + // input tensor + const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor( + in_n_c_hi_wi_global_desc, + make_tuple(make_pass_through_transform(N), + make_pass_through_transform(C), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + 
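+    // For orientation, a rough numeric sketch of the GEMM view built by this pad/embed/merge
+    // chain (sizes invented for illustration, consistent with the GemmM/GemmN/GemmK comment at
+    // the top of this file): N = 1, C = 8, Hi = Wi = 16, Y = X = 3, pad = 1, stride = dilation = 1
+    // gives Ho = Wo = 16, so with K = 32 the implicit GEMM is
+    //   GemmM = K = 32, GemmN = N * Ho * Wo = 256, GemmK = C * Y * X = 72,
+    // and each GemmK step addresses one (c, y, x) filter tap of the padded input directly,
+    // i.e. the im2col matrix is never materialized in memory.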
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor( + in_n_c_hip_wip_global_desc, + make_tuple(make_pass_through_transform(N), + make_pass_through_transform(C), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); + + const auto in_gemmk_gemmn_global_desc = + transform_dynamic_tensor_descriptor(in_n_c_y_ho_x_wo_global_desc, + make_tuple(make_merge_transform(make_tuple(C, Y, X)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // output tensor + const auto out_gemmm_gemmn_global_desc = transform_dynamic_tensor_descriptor( + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho * Wo)), + make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))), + make_tuple(Sequence<1>{}, Sequence<0, 2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple( + wei_gemmk_gemmm_global_desc, in_gemmk_gemmn_global_desc, out_gemmm_gemmn_global_desc); +} + +template +__host__ __device__ constexpr auto +transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_no_pad( + const DynamicTensorDescriptor& wei_k_c_y_x_global_desc, + const DynamicTensorDescriptor& in_n_c_hi_wi_global_desc, + const DynamicTensorDescriptor& out_n_k_ho_wo_global_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const auto N = in_n_c_hi_wi_global_desc.GetLength(I0); + const auto C = in_n_c_hi_wi_global_desc.GetLength(I1); + const auto K = out_n_k_ho_wo_global_desc.GetLength(I1); + + const auto Hi = in_n_c_hi_wi_global_desc.GetLength(I2); + const auto Wi = in_n_c_hi_wi_global_desc.GetLength(I3); + + const auto Ho = out_n_k_ho_wo_global_desc.GetLength(I2); + const auto Wo = out_n_k_ho_wo_global_desc.GetLength(I3); + + const auto Y = wei_k_c_y_x_global_desc.GetLength(I2); + const auto X = wei_k_c_y_x_global_desc.GetLength(I3); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + assert(InLeftPadH == 0 && InLeftPadW == 0 && InRightPadH == 0 && InRightPadW == 0); + + // weight tensor + const auto wei_gemmk_gemmm_global_desc = transform_dynamic_tensor_descriptor( + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C * Y * X)), + make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + // input tensor + const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor( + in_n_c_hi_wi_global_desc, + make_tuple(make_pass_through_transform(N), + make_pass_through_transform(C), + make_embed_transform(make_tuple(Y, Ho), 
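+    // coefficients (next argument): the lower index is y * ConvDilationH + ho * ConvStrideH,
+    // matching DynamicEmbed's idx_low = sum_i coefficients[i] * idx_up[i]; e.g. with stride 2
+    // and dilation 1, (y, ho) = (1, 3) maps to input row 1 * 1 + 3 * 2 = 7 (illustrative numbers).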
make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); + + const auto in_gemmk_gemmn_global_desc = + transform_dynamic_tensor_descriptor(in_n_c_y_ho_x_wo_global_desc, + make_tuple(make_merge_transform(make_tuple(C, Y, X)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // output tensor + const auto out_gemmm_gemmn_global_desc = transform_dynamic_tensor_descriptor( + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho * Wo)), + make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))), + make_tuple(Sequence<1>{}, Sequence<0, 2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple( + wei_gemmk_gemmm_global_desc, in_gemmk_gemmn_global_desc, out_gemmm_gemmn_global_desc); +} + +template +__host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_1x1( + const DynamicTensorDescriptor& wei_k_c_y_x_global_desc, + const DynamicTensorDescriptor& in_n_c_hi_wi_global_desc, + const DynamicTensorDescriptor& out_n_k_ho_wo_global_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const auto N = in_n_c_hi_wi_global_desc.GetLength(I0); + const auto C = in_n_c_hi_wi_global_desc.GetLength(I1); + const auto K = out_n_k_ho_wo_global_desc.GetLength(I1); + + const auto Hi = in_n_c_hi_wi_global_desc.GetLength(I2); + const auto Wi = in_n_c_hi_wi_global_desc.GetLength(I3); + + const auto Ho = out_n_k_ho_wo_global_desc.GetLength(I2); + const auto Wo = out_n_k_ho_wo_global_desc.GetLength(I3); + + const auto Y = wei_k_c_y_x_global_desc.GetLength(I2); + const auto X = wei_k_c_y_x_global_desc.GetLength(I3); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + assert(Y == 1 && X == 1 && ConvStrideH == 1 && ConvStrideW == 1 && ConvDilationH == 1 && + ConvDilationW == 1 && InLeftPadH == 0 && InLeftPadW == 0 && InRightPadH == 0 && + InRightPadW == 0); + + // weight tensor + const auto wei_gemmk_gemmm_global_desc = transform_dynamic_tensor_descriptor( + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C)), + make_tuple(make_pass_through_transform(K), make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + // input tensor + const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor( + in_n_c_hi_wi_global_desc, + make_tuple(make_pass_through_transform(C), make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1>{}, Sequence<0, 2, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // output tensor + const auto out_gemmm_gemmn_global_desc = transform_dynamic_tensor_descriptor( + 
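+    // The descriptor below completes the 1x1 specialization: with Y = X = 1, unit stride and
+    // dilation, and zero padding (checked by the assert above), Ho == Hi and Wo == Wi, so the
+    // input is read directly as a (C, N * Ho * Wo) matrix and no embed/im2col-style step is
+    // needed. Illustrative numbers (not taken from this file): C = 64, K = 128, N = 2,
+    // Hi = Wi = 7 give a GemmM x GemmN x GemmK of 128 x 98 x 64.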
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho * Wo)), + make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))), + make_tuple(Sequence<1>{}, Sequence<0, 2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple( + wei_gemmk_gemmm_global_desc, in_gemmk_gemmn_global_desc, out_gemmm_gemmn_global_desc); +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk.hpp b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk.hpp new file mode 100644 index 0000000000..79051d9512 --- /dev/null +++ b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,179 @@ +#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4_NHWC_KYXC_NHWK_HPP +#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4_NHWC_KYXC_NHWK_HPP + +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" + +namespace ck { + +// GemmM = K +// GemmN = N * Ho * Wo +// GemmK = C * Y * X +template +__host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk_pad( + const DynamicTensorDescriptor& wei_k_y_x_c_grid_desc, + const DynamicTensorDescriptor& in_n_hi_wi_c_grid_desc, + const DynamicTensorDescriptor& out_n_ho_wo_k_grid_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0); + const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3); + const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3); + + const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1); + const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2); + + const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1); + const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2); + + const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1); + const auto X = wei_k_y_x_c_grid_desc.GetLength(I2); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + // weight tensor + const auto wei_gemmk_gemmm_grid_desc = transform_dynamic_tensor_descriptor( + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, Y * X * C)), + make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + // input tensor + const auto in_n_hip_wip_c_grid_desc = transform_dynamic_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_y_ho_x_wo_c_grid_desc = transform_dynamic_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + 
make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_gemmk_gemmn_grid_desc = + transform_dynamic_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(Y, X, C)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // output tensor + const auto out_gemmm_gemmn_grid_desc = transform_dynamic_tensor_descriptor( + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N * Ho * Wo, K)), + make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + return make_tuple( + wei_gemmk_gemmm_grid_desc, in_gemmk_gemmn_grid_desc, out_gemmm_gemmn_grid_desc); +} + +template +__host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk_1x1( + const DynamicTensorDescriptor& wei_k_y_x_c_grid_desc, + const DynamicTensorDescriptor& in_n_hi_wi_c_grid_desc, + const DynamicTensorDescriptor& out_n_ho_wo_k_grid_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0); + const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3); + const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3); + + const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1); + const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2); + + const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1); + const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2); + + const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1); + const auto X = wei_k_y_x_c_grid_desc.GetLength(I2); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + assert(Y == 1 && X == 1 && ConvStrideH == 1 && ConvStrideW == 1 && ConvDilationH == 1 && + ConvDilationW == 1 && InLeftPadH == 0 && InLeftPadW == 0 && InRightPadH == 0 && + InRightPadW == 0); + + // weight tensor + const auto wei_gemmk_gemmm_grid_desc = transform_dynamic_tensor_descriptor( + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C)), + make_tuple(make_pass_through_transform(K), make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + // input tensor + const auto in_gemmk_gemmn_grid_desc = transform_dynamic_tensor_descriptor( + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N * Ho * Wo, C)), + make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + // output tensor + const auto 
out_gemmm_gemmn_grid_desc = transform_dynamic_tensor_descriptor( + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N * Ho * Wo, K)), + make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + return make_tuple( + wei_gemmk_gemmm_grid_desc, in_gemmk_gemmn_grid_desc, out_gemmm_gemmn_grid_desc); +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp new file mode 100644 index 0000000000..49ae26518e --- /dev/null +++ b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp @@ -0,0 +1,129 @@ +#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP +#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP + +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" + +namespace ck { + +// GemmM = K +// GemmN = N * Ho * Wo +// GemmK = C * Y * X +template +__host__ __device__ constexpr auto +transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad( + const DynamicTensorDescriptor& wei_k_c_y_x_grid_desc, + const DynamicTensorDescriptor& in_n_c_hi_wi_grid_desc, + const DynamicTensorDescriptor& out_n_k_ho_wo_grid_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + Number) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto GemmK1 = Number{}; + + const auto N = in_n_c_hi_wi_grid_desc.GetLength(I0); + const auto C = in_n_c_hi_wi_grid_desc.GetLength(I1); + const auto K = out_n_k_ho_wo_grid_desc.GetLength(I1); + + const auto Hi = in_n_c_hi_wi_grid_desc.GetLength(I2); + const auto Wi = in_n_c_hi_wi_grid_desc.GetLength(I3); + + const auto Ho = out_n_k_ho_wo_grid_desc.GetLength(I2); + const auto Wo = out_n_k_ho_wo_grid_desc.GetLength(I3); + + const auto Y = wei_k_c_y_x_grid_desc.GetLength(I2); + const auto X = wei_k_c_y_x_grid_desc.GetLength(I3); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + const auto GemmM = K; + const auto GemmN = N * Ho * Wo; + const auto GemmK = C * Y * X; + const auto GemmK0 = GemmK / GemmK1; + + // weight tensor + const auto wei_gemmk_gemmm_grid_desc = transform_dynamic_tensor_descriptor( + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C * Y * X)), + make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + const auto wei_gemmk0_gemmm_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( + wei_gemmk_gemmm_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // 
input tensor + const auto in_n_c_hip_wip_grid_desc = transform_dynamic_tensor_descriptor( + in_n_c_hi_wi_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pass_through_transform(C), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_c_y_ho_x_wo_grid_desc = transform_dynamic_tensor_descriptor( + in_n_c_hip_wip_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pass_through_transform(C), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); + + const auto in_gemmk_gemmn_grid_desc = + transform_dynamic_tensor_descriptor(in_n_c_y_ho_x_wo_grid_desc, + make_tuple(make_merge_transform(make_tuple(C, Y, X)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmk0_gemmn_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( + in_gemmk_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // output tensor + const auto out_gemmm_gemmn_grid_desc = transform_dynamic_tensor_descriptor( + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho * Wo)), + make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))), + make_tuple(Sequence<1>{}, Sequence<0, 2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple(wei_gemmk0_gemmm_gemmk1_grid_desc, + in_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc); +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp new file mode 100644 index 0000000000..5814e66766 --- /dev/null +++ b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,129 @@ +#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NHWC_KYXC_NHWK_HPP +#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NHWC_KYXC_NHWK_HPP + +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" + +namespace ck { + +// GemmM = K +// GemmN = N * Ho * Wo +// GemmK = C * Y * X +template +__host__ __device__ constexpr auto +transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk_pad( + const DynamicTensorDescriptor& wei_k_y_x_c_grid_desc, + const DynamicTensorDescriptor& in_n_hi_wi_c_grid_desc, + const DynamicTensorDescriptor& out_n_ho_wo_k_grid_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + Number) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto GemmK1 = Number{}; + + const auto N = 
in_n_hi_wi_c_grid_desc.GetLength(I0); + const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3); + const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3); + + const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1); + const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2); + + const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1); + const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2); + + const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1); + const auto X = wei_k_y_x_c_grid_desc.GetLength(I2); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + const auto GemmM = K; + const auto GemmN = N * Ho * Wo; + const auto GemmK = C * Y * X; + const auto GemmK0 = GemmK / GemmK1; + + // weight tensor + const auto wei_gemmk_gemmm_grid_desc = transform_dynamic_tensor_descriptor( + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, Y * X * C)), + make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + const auto wei_gemmk0_gemmm_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( + wei_gemmk_gemmm_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // input tensor + const auto in_n_hip_wip_c_grid_desc = transform_dynamic_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_y_ho_x_wo_c_grid_desc = transform_dynamic_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_gemmk_gemmn_grid_desc = + transform_dynamic_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(Y, X, C)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmk0_gemmn_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( + in_gemmk_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // output tensor + const auto out_gemmm_gemmn_grid_desc = transform_dynamic_tensor_descriptor( + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N * Ho * Wo, K)), + make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), 
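+    // Note on the K-split used above (a hedged reading, sizes invented for illustration):
+    // GemmK = C * Y * X is unmerged into (GemmK0, GemmK1) with GemmK0 = GemmK / GemmK1, so
+    // GemmK is assumed to be divisible by the compile-time GemmK1Value; e.g. Y = X = 3,
+    // C = 64, GemmK1 = 8 gives GemmK = 576 and GemmK0 = 72.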
+ make_tuple(Sequence<1>{}, Sequence<0>{})); + + return make_tuple(wei_gemmk0_gemmm_gemmk1_grid_desc, + in_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc); +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp new file mode 100644 index 0000000000..ad9d99f4e7 --- /dev/null +++ b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,132 @@ +#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP +#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP + +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" + +namespace ck { + +// A: in +// B: wei +// C: out +// GemmM = N * Ho * Wo +// GemmN = K +// GemmK = Y * X * C +template +__host__ __device__ constexpr auto +transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk_pad( + const DynamicTensorDescriptor& in_n_hi_wi_c_grid_desc, + const DynamicTensorDescriptor& wei_k_y_x_c_grid_desc, + const DynamicTensorDescriptor& out_n_ho_wo_k_grid_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + Number) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto GemmK1 = Number{}; + + const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0); + const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3); + const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3); + + const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1); + const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2); + + const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1); + const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2); + + const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1); + const auto X = wei_k_y_x_c_grid_desc.GetLength(I2); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + const auto GemmM = N * Ho * Wo; + const auto GemmN = K; + const auto GemmK = Y * X * C; + const auto GemmK0 = GemmK / GemmK1; + + // A: input tensor + const auto in_n_hip_wip_c_grid_desc = transform_dynamic_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_y_ho_x_wo_c_grid_desc = transform_dynamic_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + 
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_gemmk_gemmm_grid_desc = + transform_dynamic_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(Y, X, C)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmk0_gemmm_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( + in_gemmk_gemmm_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // B: weight tensor + const auto wei_gemmk_gemmn_grid_desc = transform_dynamic_tensor_descriptor( + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, Y * X * C)), + make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_dynamic_tensor_descriptor( + wei_gemmk_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: output tensor + const auto out_gemmm_gemmn_grid_desc = transform_dynamic_tensor_descriptor( + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N * Ho * Wo, K)), + make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple(in_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc); +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp new file mode 100644 index 0000000000..e709f768cb --- /dev/null +++ b/composable_kernel/include/problem_transform/transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp @@ -0,0 +1,132 @@ +#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_CONTRACTION_V6R1_NCHW_KCYX_NKHW_HPP +#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_CONTRACTION_V6R1_NCHW_KCYX_NKHW_HPP + +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" + +namespace ck { + +// GemmM0 = 1 +// GemmM1 = K +// GemmN0 = N0 +// GemmN1 = (N / N0) * Ho * Wo +// GemmK0 = (C / C0) * Y * X +// GemmK1 = C0 +template +__host__ __device__ constexpr auto +transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad( + const DynamicTensorDescriptor& wei_k_c_y_x_grid_desc, + const DynamicTensorDescriptor& in_n_c_hi_wi_grid_desc, + const DynamicTensorDescriptor& out_n_k_ho_wo_grid_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const N0Type& N0, + const C0Type& C0) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const auto N = in_n_c_hi_wi_grid_desc.GetLength(I0); + const auto C = in_n_c_hi_wi_grid_desc.GetLength(I1); + const auto K = out_n_k_ho_wo_grid_desc.GetLength(I1); + + const auto Hi = 
in_n_c_hi_wi_grid_desc.GetLength(I2); + const auto Wi = in_n_c_hi_wi_grid_desc.GetLength(I3); + + const auto Ho = out_n_k_ho_wo_grid_desc.GetLength(I2); + const auto Wo = out_n_k_ho_wo_grid_desc.GetLength(I3); + + const auto Y = wei_k_c_y_x_grid_desc.GetLength(I2); + const auto X = wei_k_c_y_x_grid_desc.GetLength(I3); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + const auto N1 = N / N0; + const auto C1 = C / C0; + + // weight tensor + const auto wei_gk0_gm0_gm1_gk1_grid_desc = transform_dynamic_tensor_descriptor( + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C * Y * X)), + make_tuple(make_unmerge_transform(make_tuple(I1, K)), + make_unmerge_transform(make_tuple(C0, C1 * Y * X))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1, 2>{}, Sequence<3, 0>{})); + + // input tensor + const auto in_n_c_hip_wip_grid_desc = transform_dynamic_tensor_descriptor( + in_n_c_hi_wi_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pass_through_transform(C), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n0_n1_c0_c1_y_ho_x_wo_grid_desc = transform_dynamic_tensor_descriptor( + in_n_c_hip_wip_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(N0, N1)), + make_unmerge_transform(make_tuple(C0, C1)), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6, 7>{})); + + const auto in_gk0_gn0_gn1_gk1_grid_desc = transform_dynamic_tensor_descriptor( + in_n0_n1_c0_c1_y_ho_x_wo_grid_desc, + make_tuple(make_merge_transform(make_tuple(C1, Y, X)), + make_pass_through_transform(N0), + make_merge_transform(make_tuple(N1, Ho, Wo)), + make_pass_through_transform(C0)), + make_tuple(Sequence<3, 4, 6>{}, Sequence<0>{}, Sequence<1, 5, 7>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + // output tensor + const auto out_n_k_howo_grid_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho * Wo)); + + const auto out_n0_n1_1_k_howo_grid_desc = transform_dynamic_tensor_descriptor( + out_n_k_howo_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(N0, N1)), + make_unmerge_transform(make_tuple(I1, K)), + make_pass_through_transform(Ho * Wo)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}, Sequence<4>{})); + + const auto out_gm0_gm1_gn0_gn1_grid_desc = transform_dynamic_tensor_descriptor( + out_n0_n1_1_k_howo_grid_desc, + make_tuple(make_pass_through_transform(I1), + make_pass_through_transform(K), + make_pass_through_transform(N0), + make_merge_transform_v2_magic_division(make_tuple(N1, Ho * Wo))), + make_tuple(Sequence<2>{}, Sequence<3>{}, Sequence<0>{}, Sequence<1, 4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, 
Sequence<3>{})); + + return make_tuple( + wei_gk0_gm0_gm1_gk1_grid_desc, in_gk0_gn0_gn1_gk1_grid_desc, out_gm0_gm1_gn0_gn1_grid_desc); +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_description/cluster_descriptor.hpp b/composable_kernel/include/tensor_description/cluster_descriptor.hpp new file mode 100644 index 0000000000..c3523623d9 --- /dev/null +++ b/composable_kernel/include/tensor_description/cluster_descriptor.hpp @@ -0,0 +1,33 @@ +#ifndef CK_CLUSTER_DESCRIPTOR_HPP +#define CK_CLUSTER_DESCRIPTOR_HPP + +#include "common_header.hpp" +#include "tensor_adaptor.hpp" + +namespace ck { + +template ::type> +__host__ __device__ constexpr auto make_cluster_descriptor_v2( + const Lengths& lengths, + ArrangeOrder order = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type{}) +{ + constexpr index_t ndim_low = Lengths::Size(); + + const auto reordered_lengths = container_reorder_given_new2old(lengths, order); + + const auto low_lengths = generate_tuple( + [&](auto idim_low) { return reordered_lengths[idim_low]; }, Number{}); + + const auto transform = make_merge_transform(low_lengths); + + constexpr auto low_dim_old_top_ids = ArrangeOrder{}; + + constexpr auto up_dim_new_top_ids = Sequence<0>{}; + + return make_single_stage_tensor_adaptor( + make_tuple(transform), make_tuple(low_dim_old_top_ids), make_tuple(up_dim_new_top_ids)); +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp b/composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp new file mode 100644 index 0000000000..967517bef7 --- /dev/null +++ b/composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp @@ -0,0 +1,1737 @@ +#ifndef CK_DYNAMIC_MULTI_INDEX_TRANSFORM_HPP +#define CK_DYNAMIC_MULTI_INDEX_TRANSFORM_HPP + +#include "common_header.hpp" +#include "multi_index.hpp" + +namespace ck { + +template +struct DynamicPassThrough +{ + using LowerIndex = MultiIndex<1>; + using UpperIndex = MultiIndex<1>; + + using UpLengths = decltype(make_tuple(LowLength{})); + + UpLengths up_lengths_; + + __host__ __device__ constexpr DynamicPassThrough() = default; + + __host__ __device__ constexpr DynamicPassThrough(const LowLength& low_length) + : up_lengths_{make_tuple(low_length)} + { + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ static void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) + { + static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + idx_low(Number<0>{}) = idx_up[Number<0>{}]; + } + + template + __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx&, + Number) + { + static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 && + UpIdx::Size() == 1, + "wrong! 
inconsistent # of dimension"); + + constexpr auto I0 = Number<0>{}; + + idx_diff_low(I0) = idx_diff_up[I0]; + + idx_low += idx_diff_low; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return true; + } + + template + __host__ __device__ static constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */) + { + return true; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("DynamicPassThrough, "); + printf("up_lengths_"); + print_multi_index(up_lengths_); + printf("}"); + } +}; + +template +struct DynamicPad +{ + using LowerIndex = MultiIndex<1>; + using UpperIndex = MultiIndex<1>; + + using UpLengths = decltype(make_tuple(LowLength{} + LeftPad{} + RightPad{})); + + UpLengths up_lengths_; + LeftPad left_pad_; + RightPad right_pad_; + + __host__ __device__ constexpr DynamicPad() = default; + + __host__ __device__ constexpr DynamicPad(const LowLength& low_length, + const LeftPad& left_pad, + const RightPad& right_pad) + : up_lengths_{make_tuple(low_length + left_pad + right_pad)}, + left_pad_{left_pad}, + right_pad_{right_pad} + { + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& idx_up) const + { + static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + idx_low(Number<0>{}) = idx_up[Number<0>{}] - left_pad_; + } + + template + __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx&, + Number) + { + static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 && + UpIdx::Size() == 1, + "wrong! 
inconsistent # of dimension"); + + constexpr auto I0 = Number<0>{}; + + idx_diff_low(I0) = idx_diff_up[I0]; + + idx_low += idx_diff_low; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return SkipIsValidCheck; + } + + template + __host__ __device__ constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& idx_up) const + { + return SkipIsValidCheck || ((idx_up[Number<0>{}] >= left_pad_) && + (idx_up[Number<0>{}] < up_lengths_[Number<0>{}] - right_pad_)); + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value && + is_known_at_compile_time::value && + is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("DynamicPad, "); + printf("up_lengths_"); + print_multi_index(up_lengths_); + printf("left_pad_ %d", index_t{left_pad_}); + printf("right_pad_ %d", index_t{right_pad_}); + printf("}"); + } +}; + +template +struct DynamicLeftPad +{ + using LowerIndex = MultiIndex<1>; + using UpperIndex = MultiIndex<1>; + + using UpLengths = decltype(make_tuple(LowLength{} + LeftPad{})); + + UpLengths up_lengths_; + LeftPad left_pad_; + + __host__ __device__ constexpr DynamicLeftPad() = default; + + __host__ __device__ constexpr DynamicLeftPad(const LowLength& low_length, + const LeftPad& left_pad) + : up_lengths_{make_tuple(low_length + left_pad)}, left_pad_{left_pad} + { + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& idx_up) const + { + static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + idx_low(Number<0>{}) = idx_up[Number<0>{}] - left_pad_; + } + + template + __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx&, + Number) + { + static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 && + UpIdx::Size() == 1, + "wrong! 
inconsistent # of dimension"); + + constexpr auto I0 = Number<0>{}; + + idx_diff_low(I0) = idx_diff_up[I0]; + + idx_low += idx_diff_low; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return SkipIsValidCheck; + } + + template + __host__ __device__ constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& idx_up) const + { + return SkipIsValidCheck || (idx_up[Number<0>{}] >= left_pad_); + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value && + is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("DynamicLeftPad, "); + printf("up_lengths_"); + print_multi_index(up_lengths_); + printf("left_pad_ %d", index_t{left_pad_}); + printf("}"); + } +}; + +template +struct DynamicRightPad +{ + using LowerIndex = MultiIndex<1>; + using UpperIndex = MultiIndex<1>; + + using UpLengths = decltype(make_tuple(LowLength{} + RightPad{})); + + UpLengths up_lengths_; + LowLength low_length_; + RightPad right_pad_; + + __host__ __device__ constexpr DynamicRightPad() = default; + + __host__ __device__ constexpr DynamicRightPad(const LowLength& low_length, + const RightPad& right_pad) + : up_lengths_{make_tuple(low_length + right_pad)}, + low_length_{low_length}, + right_pad_{right_pad} + { + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ static constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& idx_up) + { + static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + idx_low(Number<0>{}) = idx_up[Number<0>{}]; + } + + template + __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx&, + Number) + { + static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 && + UpIdx::Size() == 1, + "wrong! 
inconsistent # of dimension"); + + constexpr auto I0 = Number<0>{}; + + idx_diff_low(I0) = idx_diff_up[I0]; + + idx_low += idx_diff_low; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return SkipIsValidCheck; + } + + template + __host__ __device__ constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& idx_up) const + { + return SkipIsValidCheck || (idx_up[Number<0>{}] < low_length_); + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value && + is_known_at_compile_time::value && + is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("DynamicRightPad, "); + printf("up_lengths_"); + print_multi_index(up_lengths_); + printf("low_length_ %d", index_t{low_length_}); + printf("left_pad_ %d", index_t{right_pad_}); + printf("}"); + } +}; + +// idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1] +// UpLengths and Coefficients can be either of the followings: +// 1) Tuple of index_t, which is known at run-time, or +// 2) Tuple of Number, which is known at compile-time, or +// 3) Tuple of mixture of index_t and Number, which is known partially at run-time and partially +// at compile-time +template ::type = false> +struct DynamicEmbed +{ + static constexpr index_t NDimUp = UpLengths::Size(); + + using LowerIndex = MultiIndex<1>; + using UpperIndex = MultiIndex; + + UpLengths up_lengths_; + Coefficients coefficients_; + + __host__ __device__ constexpr DynamicEmbed() = default; + + __host__ __device__ constexpr DynamicEmbed(const UpLengths& up_lengths, + const Coefficients& coefficients) + : up_lengths_{up_lengths}, coefficients_{coefficients} + { + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& idx_up) const + { + static_assert(LowIdx::Size() == 1 && UpIdx::Size() == NDimUp, + "wrong! inconsistent # of dimension"); + + idx_low(Number<0>{}) = 0; + + static_for<0, NDimUp, 1>{}([&idx_low, &idx_up, this](auto i) { + idx_low(Number<0>{}) += idx_up[i] * this->coefficients_[i]; + }); + } + + template + __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx&, + Number) const + { + static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == NDimUp && + LowIdx::Size() == 1 && UpIdx::Size() == NDimUp, + "wrong! 
inconsistent # of dimension"); + + idx_diff_low(Number<0>{}) = 0; + + static_for<0, NDimUp, 1>{}( + [&](auto i) { idx_diff_low(Number<0>{}) += idx_diff_up[i] * coefficients_[i]; }); + + idx_low += idx_diff_low; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return true; + } + + template + __host__ __device__ static constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */) + { + return true; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value && + is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("DynamicEmbed, "); + printf("up_lengths_ "); + print_multi_index(up_lengths_); + printf("coefficients_ "); + print_multi_index(coefficients_); + printf("}"); + } +}; + +// Implementation of "Merge" transformation primitive that uses regular to do lowering of +// multi-index and use carry-and-borrow check to do lowering of multi-index delta +template +struct DynamicMerge_v1_carry_check +{ + static constexpr index_t NDimLow = LowLengths::Size(); + + using LowerIndex = MultiIndex; + using UpperIndex = MultiIndex<1>; + + using LowLengthsScan = decltype( + container_reverse_exclusive_scan(LowLengths{}, math::multiplies_v2{}, Number<1>{})); + + using UpLengths = + decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies_v2{}, Number<1>{}))); + + LowLengths low_lengths_; + LowLengthsScan low_lengths_scan_; + UpLengths up_lengths_; + + __host__ __device__ constexpr DynamicMerge_v1_carry_check() = default; + + __host__ __device__ constexpr DynamicMerge_v1_carry_check(const LowLengths& low_lengths) + : low_lengths_{low_lengths}, + low_lengths_scan_{ + container_reverse_exclusive_scan(low_lengths, math::multiplies_v2{}, Number<1>{})}, + up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies_v2{}, Number<1>{}))} + { + static_assert(LowerIndex::Size() == NDimLow, "wrong!"); + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return NDimLow; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& idx_up) const + { + static_assert(LowIdx::Size() == NDimLow && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + index_t tmp = idx_up[Number<0>{}]; + + // normal division + static_for<0, NDimLow - 1, 1>{}([&](auto i) { + idx_low(i) = tmp / this->low_lengths_scan_[i]; + tmp -= idx_low[i] * this->low_lengths_scan_[i]; + }); + + idx_low(Number{}) = tmp; + } + + template + __host__ __device__ void UpdateLowerIndex_1a(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx& /* idx_up_new */, + Number) const + { + static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 && + LowIdx::Size() == NDimLow && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions. + // However, + // 1) If idx_diff_up is known at compile-time, then idx_diff_low_const + // can be calculated at compile-time. 
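+        //    For illustration (an assumed typical use, not taken from this file): for a merge
+        //    of low lengths (3, 4), low_lengths_scan_ is (4, 1); if the transfer loop steps the
+        //    merged dimension by a compile-time Number<8>{}, the divisions fold to
+        //    idx_diff_low_const = (2, 0) at compile time and no runtime division is emitted.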
+ // 2) If idx_diff_up is not known at compile-time, but its value + // doesn't change during the whole kernel execution, then + // idx_diff_low_const also + // doesn't change during the whole kernel execution. Compiler generated + // ISA should + // only caclculate idx_diff_low_const once and save it durinng the whole + // kernel execution + // If neither 1) nor 2) is satisfied, then the calculation will also be + // computed at + // run-time each time this function is called, and can be very expensive. + LowerIndex idx_diff_low_const; + LowerIndex idx_low_length_minus_idx_diff_low_const; + LowerIndex idx_low_length_plus_idx_diff_low_const; + +#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE + index_t tmp = idx_diff_up[Number<0>{}]; + + static_for<0, NDimLow - 1, 1>{}([&](auto i) { + idx_diff_low_const(i) = tmp / low_lengths_scan_[i]; + tmp -= idx_diff_low_const[i] * low_lengths_scan_[i]; + }); + + idx_diff_low_const(Number{}) = tmp; + + static_for<0, NDimLow, 1>{}([&](auto i) { + idx_low_length_minus_idx_diff_low_const(i) = low_lengths_[i] - idx_diff_low_const[i]; + + idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i]; + }); +#else + // Hack: this force result into SGPR. Need to make sure the result is thread invariant + index_t tmp = idx_diff_up[Number<0>{}]; + + static_for<0, NDimLow - 1, 1>{}([&](auto i) { + idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]); + tmp -= idx_diff_low_const[i] * low_lengths_scan_[i]; + }); + + idx_diff_low_const(Number{}) = __builtin_amdgcn_readfirstlane(tmp); + + static_for<0, NDimLow, 1>{}([&](auto i) { + idx_low_length_minus_idx_diff_low_const(i) = + __builtin_amdgcn_readfirstlane(low_lengths_[i] - idx_diff_low_const[i]); + + idx_low_length_plus_idx_diff_low_const(i) = + __builtin_amdgcn_readfirstlane(low_lengths_[i] + idx_diff_low_const[i]); + }); +#endif + + if constexpr(Hack == 1) + { + // do carry check on each low dimension in reversed order + // do not need to check the first dimension + index_t carry = 0; + + static_for{}([&](auto i) { + index_t idx_low_tmp = idx_low[i] + carry; + + bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i]; + + idx_diff_low(i) = + do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i]; + + idx_diff_low(i) += carry; + + carry = do_carry ? 1 : 0; + }); + + idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry; + + idx_low += idx_diff_low; + } + else if constexpr(Hack == 2) + { + // do carry check on each low dimension in reversed order + // do not need to check the first dimension + index_t borrow = 0; + + static_for{}([&](auto i) { + index_t idx_low_tmp = idx_low[i] - borrow; + + bool do_borrow = idx_low_tmp < -idx_diff_low_const[i]; + + idx_diff_low(i) = + do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low_const[i]; + + idx_diff_low(i) -= borrow; + + borrow = do_borrow ? 1 : 0; + }); + + idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - borrow; + + idx_low += idx_diff_low; + } + else + { + // do carry check on each low dimension in reversed order + // do not need to check the first dimension + index_t carry = 0; + + static_for{}([&](auto i) { + index_t idx_low_tmp = idx_low[i] + carry; + + bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i]; + bool do_borrow = idx_low_tmp < -idx_diff_low_const[i]; + + idx_diff_low(i) = + do_carry ? 
-idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i]; + idx_diff_low(i) = + do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i]; + + idx_diff_low(i) += carry; + + carry = do_carry ? 1 : 0; + carry = do_borrow ? -1 : carry; + }); + + idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry; + + idx_low += idx_diff_low; + } + } + + template + __host__ __device__ void UpdateLowerIndex_1b(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx& /* idx_up_new */, + Number) const + { + static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 && + LowIdx::Size() == NDimLow && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions. + // However, + // 1) If idx_diff_up is known at compile-time, then idx_diff_low_const + // can be calculated at compile-time. + // 2) If idx_diff_up is not known at compile-time, but its value + // doesn't change during the whole kernel execution, then + // idx_diff_low_const also + // doesn't change during the whole kernel execution. Compiler generated + // ISA should + // only caclculate idx_diff_low_const once and save it durinng the whole + // kernel execution + // If neither 1) nor 2) is satisfied, then the calculation will also be + // computed at + // run-time each time this function is called, and can be very expensive. + LowerIndex idx_diff_low_const; + LowerIndex idx_low_length_minus_idx_diff_low_const; + LowerIndex idx_low_length_plus_idx_diff_low_const; + +#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE + index_t tmp = idx_diff_up[Number<0>{}]; + + static_for<0, NDimLow - 1, 1>{}([&](auto i) { + idx_diff_low_const(i) = tmp / low_lengths_scan_[i]; + tmp -= idx_diff_low_const[i] * low_lengths_scan_[i]; + }); + + idx_diff_low_const(Number{}) = tmp; + + static_for<0, NDimLow, 1>{}([&](auto i) { + idx_low_length_minus_idx_diff_low_const(i) = low_lengths_[i] - idx_diff_low_const[i]; + + idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i]; + }); +#else + // Hack: this force result into SGPR. Need to make sure the result is thread invariant + index_t tmp = idx_diff_up[Number<0>{}]; + + static_for<0, NDimLow - 1, 1>{}([&](auto i) { + idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]); + tmp -= idx_diff_low_const[i] * low_lengths_scan_[i]; + }); + + idx_diff_low_const(Number{}) = __builtin_amdgcn_readfirstlane(tmp); + + static_for<0, NDimLow, 1>{}([&](auto i) { + idx_low_length_minus_idx_diff_low_const(i) = + __builtin_amdgcn_readfirstlane(low_lengths_[i] - idx_diff_low_const[i]); + + idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i]; + }); +#endif + + if constexpr(Hack == 1) + { + // do carry check on each low dimension in reversed order + // do not need to check the first dimension + index_t carry = 0; + + static_for{}([&](auto i) { + index_t idx_low_tmp = idx_low[i] + carry; + + bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i]; + + idx_diff_low(i) = + do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i]; + + idx_diff_low(i) += carry; + + carry = do_carry ? 
1 : 0; + }); + + idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry; + + idx_low += idx_diff_low; + } + else if constexpr(Hack == 2) + { + // do carry check on each low dimension in reversed order + // do not need to check the first dimension + index_t borrow = 0; + + static_for{}([&](auto i) { + index_t negative_idx_low_tmp = borrow - idx_low[i]; + + bool do_borrow = negative_idx_low_tmp > idx_diff_low_const[i]; + + idx_diff_low(i) = + do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low_const[i]; + + idx_diff_low(i) -= borrow; + + borrow = do_borrow ? 1 : 0; + }); + + idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - borrow; + + idx_low += idx_diff_low; + } + else + { + // do carry check on each low dimension in reversed order + // do not need to check the first dimension + index_t carry = 0; + + static_for{}([&](auto i) { + index_t idx_low_tmp = idx_low[i] + carry; + + bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i]; + bool do_borrow = idx_low_tmp < -idx_diff_low_const[i]; + + idx_diff_low(i) = + do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i]; + idx_diff_low(i) = + do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i]; + + idx_diff_low(i) += carry; + + carry = do_carry ? 1 : 0; + carry = do_borrow ? -1 : carry; + }); + + idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry; + + idx_low += idx_diff_low; + } + } + + template + __host__ __device__ void UpdateLowerIndex_2(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx& /* idx_up_new */, + Number) const + { + static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 && + LowIdx::Size() == NDimLow && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions. + // However, + // 1) If idx_diff_up is known at compile-time, then idx_diff_low_const + // can be calculated at compile-time. + // 2) If idx_diff_up is not known at compile-time, but its value + // doesn't change during the whole kernel execution, then + // idx_diff_low_const also + // doesn't change during the whole kernel execution. Compiler generated + // ISA should + // only caclculate idx_diff_low_const once and save it durinng the whole + // kernel execution + // If neither 1) nor 2) is satisfied, then the calculation will also be + // computed at run-time each time this function is called, and can be + // very expensive. + LowerIndex idx_diff_low_const; + +#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE + index_t tmp = idx_diff_up[Number<0>{}]; + + static_for<0, NDimLow - 1, 1>{}([&](auto i) { + idx_diff_low_const(i) = tmp / low_lengths_scan_[i]; + tmp -= idx_diff_low_const[i] * low_lengths_scan_[i]; + }); + + idx_diff_low_const(Number{}) = tmp; +#else + // Hack: this force result into SGPR. 
Need to make sure the result is thread invariant + index_t tmp = idx_diff_up[Number<0>{}]; + + static_for<0, NDimLow - 1, 1>{}([&](auto i) { + idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]); + tmp -= idx_diff_low_const[i] * low_lengths_scan_[i]; + }); + + idx_diff_low_const(Number{}) = __builtin_amdgcn_readfirstlane(tmp); +#endif + + if constexpr(Hack == 1) + { + // do carry check on each low dimension in reversed order + // do not need to check the first dimension + bool do_carry = 0; + + static_for{}([&](auto i) { + idx_diff_low(i) = idx_diff_low_const[i] + do_carry; + + index_t idx_low_tmp = idx_low[i] + idx_diff_low[i]; + + do_carry = idx_low_tmp >= low_lengths_[i]; + +#if 0 + // TODO: use exec-mask inline asm, which use 1 VALU + if(do_carry) + { + idx_diff_low(i) -= low_lengths_[i]; + } +#elif 1 + // this use 2 VALU + idx_diff_low(i) = do_carry ? idx_diff_low[i] - low_lengths_[i] : idx_diff_low[i]; +#elif 1 + // this use 2 VALU + index_t idx_diff_low_tmp = idx_diff_low[i] - low_lengths_[i]; + idx_diff_low(i) = do_carry ? idx_diff_low_tmp : idx_diff_low[i]; +#endif + + idx_low(i) += idx_diff_low[i]; + }); + + constexpr auto I0 = Number<0>{}; + + idx_diff_low(I0) = idx_diff_low_const[I0] + do_carry; + + idx_low(I0) += idx_diff_low[I0]; + } + else if constexpr(Hack == 2) + { + // do borrow check on each low dimension in reversed order + // do not need to check the first dimension + bool do_borrow = 0; + + static_for{}([&](auto i) { + idx_diff_low(i) = idx_diff_low_const[i] - do_borrow; + + index_t idx_low_tmp = idx_low[i] + idx_diff_low[i]; + + do_borrow = idx_low_tmp < 0; + +#if 0 + // TODO: use exec-mask inline asm + if(do_borrow) + { + idx_diff_low(i) += low_lengths_[i]; + } +#elif 1 + idx_diff_low(i) = do_borrow ? idx_diff_low[i] + low_lengths_[i] : idx_diff_low[i]; +#elif 1 + index_t idx_diff_low_tmp = idx_diff_low[i] + low_lengths_[i]; + idx_diff_low(i) = do_borrow ? 
idx_diff_low_tmp : idx_diff_low[i]; +#endif + + idx_low(i) += idx_diff_low[i]; + }); + + constexpr auto I0 = Number<0>{}; + + idx_diff_low(I0) = idx_diff_low_const[I0] - do_borrow; + + idx_low(I0) += idx_diff_low[I0]; + } + else + { + // not implemented + } + } + + template + __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx& idx_up_new, + Number) const + { +#if 1 + UpdateLowerIndex_1a(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number{}); +#elif 0 + UpdateLowerIndex_1b(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number{}); +#else + UpdateLowerIndex_2(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number{}); +#endif + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return false; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return true; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value && + is_known_at_compile_time::value && + is_known_at_compile_time::value; + } + + template + __host__ __device__ static constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */) + { + return true; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("DynamicMerge_v1_carry_check, "); + printf("low_lengths_ "); + print_multi_index(low_lengths_); + printf("low_lengths_scan_ "); + print_multi_index(low_lengths_scan_); + printf("up_lengths_ "); + print_multi_index(up_lengths_); + printf("}"); + } +}; + +template +struct lambda_merge_generate_MagicDivision_calculate_magic_multiplier +{ + template + __host__ __device__ constexpr auto operator()(Number i) const + { + return MagicDivision::CalculateMagicMultiplier(LowLengths{}[i]); + } +}; + +template +struct lambda_merge_generate_MagicDivision_calculate_magic_shift +{ + template + __host__ __device__ constexpr auto operator()(Number i) const + { + return MagicDivision::CalculateMagicShift(LowLengths{}[i]); + } +}; + +// Implementation of the "Merge" transformation primitive that uses magic-number division to do the +// lowering of both the multi-index and the delta of the multi-index +// Caution: +// 1. The magic number division implementation being used produces a correct result only if the +// dividend is uint32_t and its value is within the 31-bit value range of uint32_t. +// 2. Magic number division for an int32_t dividend has not been implemented; an int32_t +// dividend is bit-wise reinterpreted as uint32_t and the magic number division implementation for +// uint32_t is then used. +// 3. For the Merge primitive, the upper-index is the dividend. +// 4. When the upper-index is uint32_t, its value needs to be within the 31-bit range. +// 5. When the upper-index is int32_t (i.e. when index_t is int32_t), its value needs to be +// non-negative.
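+//
+// Illustrative example (a sketch of the general magic-number scheme assumed above; the exact
+// multiplier/shift pair returned by MagicDivision::CalculateMagicMultiplier and
+// MagicDivision::CalculateMagicShift may differ):
+//
+//   for a divisor d = 3, one valid pair is multiplier = 0xAAAAAAABu and shift = 33, so that
+//       q = (uint64_t(n) * 0xAAAAAAABu) >> 33
+//   equals n / 3 for every n in the documented 31-bit range, e.g.
+//       (uint64_t(100) * 0xAAAAAAABu) >> 33 == 33.
+//   DoMagicDivision is expected to evaluate the same q from (n, multiplier, shift) without
+//   emitting an integer-division instruction.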
+template +struct DynamicMerge_v2_magic_division +{ + static constexpr index_t NDimLow = LowLengths::Size(); + + using LowerIndex = MultiIndex; + using UpperIndex = MultiIndex<1>; + + using UpLengths = + decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies_v2{}, Number<1>{}))); + + using LowLengthsMagicDivisorMultipiler = decltype( + generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_multiplier{}, + Number{})); + + using LowLengthsMagicDivisorShift = decltype( + generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_shift{}, + Number{})); + + LowLengths low_lengths_; + LowLengthsMagicDivisorMultipiler low_lengths_magic_divisor_multiplier_; + LowLengthsMagicDivisorShift low_lengths_magic_divisor_shift_; + UpLengths up_lengths_; + + __host__ __device__ constexpr DynamicMerge_v2_magic_division() = default; + + __host__ __device__ constexpr DynamicMerge_v2_magic_division(const LowLengths& low_lengths) + : low_lengths_{low_lengths}, + low_lengths_magic_divisor_multiplier_{generate_tuple( + [&](auto i) { return MagicDivision::CalculateMagicMultiplier(low_lengths[i]); }, + Number{})}, + low_lengths_magic_divisor_shift_{generate_tuple( + [&](auto i) { return MagicDivision::CalculateMagicShift(low_lengths[i]); }, + Number{})}, + up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies_v2{}, Number<1>{}))} + { + static_assert(LowerIndex::Size() == NDimLow, "wrong!"); + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return NDimLow; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& idx_up) const + { + static_assert(LowIdx::Size() == NDimLow && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + index_t tmp = idx_up[Number<0>{}]; + + static_for{}([&, this](auto i) { + index_t tmp2 = + MagicDivision::DoMagicDivision(tmp, + this->low_lengths_magic_divisor_multiplier_[i], + this->low_lengths_magic_divisor_shift_[i]); + idx_low(i) = tmp - tmp2 * this->low_lengths_[i]; + tmp = tmp2; + }); + + idx_low(Number<0>{}) = tmp; + } + + template + __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff&, + LowIdx& idx_low, + const UpIdx& idx_up_new, + Number) const + { + static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 && + LowIdx::Size() == NDimLow && UpIdx::Size() == 1, + "wrong! 
inconsistent # of dimension"); + + index_t tmp = idx_up_new[Number<0>{}]; + + static_for{}([&, this](auto i) { + index_t tmp2 = + MagicDivision::DoMagicDivision(tmp, + this->low_lengths_magic_divisor_multiplier_[i], + this->low_lengths_magic_divisor_shift_[i]); + + index_t idx_low_old = idx_low[i]; + + idx_low(i) = tmp - tmp2 * this->low_lengths_[i]; + tmp = tmp2; + + idx_diff_low(i) = idx_low[i] - idx_low_old; + }); + + idx_diff_low(Number<0>{}) = tmp - idx_low(Number<0>{}); + + idx_low(Number<0>{}) = tmp; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return false; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return true; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value && + is_known_at_compile_time::value && + is_known_at_compile_time::value && + is_known_at_compile_time::value; + } + + template + __host__ __device__ static constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */) + { + return true; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("DynamicMerge_v2_magic_division, "); + printf("low_lengths_ "); + print_multi_index(low_lengths_); + printf("low_lengths_magic_divisor_multiplier_ "); + print_multi_index(low_lengths_magic_divisor_multiplier_); + printf("low_lengths_magic_divisor_shift_ "); + print_multi_index(low_lengths_magic_divisor_shift_); + printf("up_lengths_ "); + print_multi_index(up_lengths_); + printf("}"); + } +}; + +// Implementation of the "Merge" transformation primitive that uses magic-number division to do the +// lowering of both the multi-index and the delta of the multi-index +// Caution: +// 1. The magic number division implementation being used produces a correct result only if the +// dividend is uint32_t and its value is within the 31-bit value range of uint32_t. +// 2. Magic number division for an int32_t dividend has not been implemented; an int32_t +// dividend is bit-wise reinterpreted as uint32_t and the magic number division implementation for +// uint32_t is then used. +// 3. For the Merge primitive, the upper-index is the dividend. +// 4. When the upper-index is uint32_t, its value needs to be within the 31-bit range. +// 5. When the upper-index is int32_t (i.e. when index_t is int32_t), its value needs to be +// non-negative.
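+//
+// Worked example of the lowering itself (illustrative numbers only): for low lengths (2, 3, 4),
+// low_lengths_scan_ produced by container_reverse_exclusive_scan is (12, 4, 1) and the merged
+// upper length is 2 * 3 * 4 = 24. The upper index 17 is lowered by dividing by the scan values:
+// 17 / 12 = 1 (remainder 5), 5 / 4 = 1 (remainder 1), last component 1, giving the lower
+// multi-index (1, 1, 1); indeed 1 * 12 + 1 * 4 + 1 * 1 == 17. DynamicMerge_v2r2_magic_division
+// below performs these divisions with the magic-number scheme described above, dividing by
+// low_lengths_scan_ instead of by the individual low lengths.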
+template +struct DynamicMerge_v2r2_magic_division +{ + static constexpr index_t NDimLow = LowLengths::Size(); + + using LowerIndex = MultiIndex; + using UpperIndex = MultiIndex<1>; + + using LowLengthsScan = decltype( + container_reverse_exclusive_scan(LowLengths{}, math::multiplies_v2{}, Number<1>{})); + + using UpLengths = + decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies_v2{}, Number<1>{}))); + + using LowLengthsScanMagicDivisorMultipiler = decltype(generate_tuple( + lambda_merge_generate_MagicDivision_calculate_magic_multiplier{}, + Number{})); + + using LowLengthsScanMagicDivisorShift = decltype( + generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_shift{}, + Number{})); + + LowLengths low_lengths_; + LowLengthsScan low_lengths_scan_; + LowLengthsScanMagicDivisorMultipiler low_lengths_scan_magic_divisor_multiplier_; + LowLengthsScanMagicDivisorShift low_lengths_scan_magic_divisor_shift_; + UpLengths up_lengths_; + + __host__ __device__ constexpr DynamicMerge_v2r2_magic_division() = default; + + __host__ __device__ constexpr DynamicMerge_v2r2_magic_division(const LowLengths& low_lengths) + : low_lengths_{low_lengths}, + low_lengths_scan_{ + container_reverse_exclusive_scan(low_lengths, math::multiplies_v2{}, Number<1>{})}, + low_lengths_scan_magic_divisor_multiplier_{generate_tuple( + [&](auto i) { return MagicDivision::CalculateMagicMultiplier(low_lengths_scan_[i]); }, + Number{})}, + low_lengths_scan_magic_divisor_shift_{generate_tuple( + [&](auto i) { return MagicDivision::CalculateMagicShift(low_lengths_scan_[i]); }, + Number{})}, + up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies_v2{}, Number<1>{}))} + { + static_assert(LowerIndex::Size() == NDimLow, "wrong!"); + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return NDimLow; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& idx_up) const + { + static_assert(LowIdx::Size() == NDimLow && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + index_t tmp = idx_up[Number<0>{}]; + + static_for<0, NDimLow - 1, 1>{}([&, this](auto i) { + idx_low(i) = + MagicDivision::DoMagicDivision(tmp, + this->low_lengths_scan_magic_divisor_multiplier_[i], + this->low_lengths_scan_magic_divisor_shift_[i]); + + tmp -= idx_low[i] * this->low_lengths_scan_[i]; + }); + + idx_low(Number{}) = tmp; + } + + template + __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff&, + LowIdx& idx_low, + const UpIdx& idx_up_new, + Number) const + { + static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 && + LowIdx::Size() == NDimLow && UpIdx::Size() == 1, + "wrong! 
inconsistent # of dimension"); + + index_t tmp = idx_up_new[Number<0>{}]; + + static_for<0, NDimLow - 1, 1>{}([&, this](auto i) { + index_t idx_low_old = idx_low[i]; + + idx_low(i) = + MagicDivision::DoMagicDivision(tmp, + this->low_lengths_scan_magic_divisor_multiplier_[i], + this->low_lengths_scan_magic_divisor_shift_[i]); + + idx_diff_low(i) = idx_low[i] - idx_low_old; + + tmp -= idx_low[i] * this->low_lengths_scan_[i]; + }); + + idx_diff_low(Number{}) = tmp - idx_low[Number{}]; + + idx_low(Number{}) = tmp; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return false; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return true; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value && + is_known_at_compile_time::value && + is_known_at_compile_time::value && + is_known_at_compile_time::value; + } + + template + __host__ __device__ static constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */) + { + return true; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("DynamicMerge_v2r2_magic_division, "); + printf("low_lengths_ "); + print_multi_index(low_lengths_); + printf("low_lengths_scan "); + print_multi_index(low_lengths_scan_); + printf("low_lengths_scan_magic_divisor_multiplier_ "); + print_multi_index(low_lengths_scan_magic_divisor_multiplier_); + printf("low_lengths_scan_magic_divisor_shift_ "); + print_multi_index(low_lengths_scan_magic_divisor_shift_); + printf("up_lengths_ "); + print_multi_index(up_lengths_); + printf("}"); + } +}; + +template +struct DynamicUnMerge +{ + static constexpr index_t NDimUp = UpLengths::Size(); + + using LowerIndex = MultiIndex<1>; + using UpperIndex = MultiIndex; + + using UpLengthsScan = + decltype(container_reverse_exclusive_scan(UpLengths{}, math::multiplies_v2{}, Number<1>{})); + + UpLengths up_lengths_; + UpLengthsScan up_lengths_scan_; + + __host__ __device__ constexpr DynamicUnMerge() = default; + + __host__ __device__ constexpr DynamicUnMerge(const UpLengths& up_lengths) + : up_lengths_{up_lengths}, + up_lengths_scan_{ + container_reverse_exclusive_scan(up_lengths, math::multiplies_v2{}, Number<1>{})} + { + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& idx_up) const + { + if constexpr(!Use24BitIntegerCalculation) + { + idx_low(Number<0>{}) = idx_up[Number{}]; + + static_for<0, NDimUp - 1, 1>{}( + [&](auto i) { idx_low(Number<0>{}) += idx_up[i] * up_lengths_scan_[i]; }); + } + else + { + idx_low(Number<0>{}) = idx_up[Number{}]; + + static_for<0, NDimUp - 1, 1>{}([&](auto i) { + idx_low(Number<0>{}) = + (0x00ffffff & idx_low[Number<0>{}]) + + (0x00ffffff & idx_up[i]) * (0x00ffffff & up_lengths_scan_[i]); + }); + } + } + + template + __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx&, + Number) const + { + CalculateLowerIndex(idx_diff_low, idx_diff_up); + + idx_low += idx_diff_low; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + __host__ __device__ static constexpr bool 
IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return true; + } + + template + __host__ __device__ static constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */) + { + return true; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value && + is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("DynamicUnMerge, "); + printf("up_lengths_"); + print_multi_index(up_lengths_); + printf("up_lengths_scan_"); + print_multi_index(up_lengths_scan_); + printf("}"); + } +}; + +template +struct DynamicFreeze +{ + LowerIndex low_idx_; + + __host__ __device__ constexpr DynamicFreeze() = default; + + __host__ __device__ constexpr DynamicFreeze(const LowerIndex& low_idx) : low_idx_{low_idx} {} + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 0; } + + __host__ __device__ static constexpr auto GetUpperLengths() { return Tuple<>{}; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& /* idx_up */) const + { + static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 0, + "wrong! inconsistent # of dimension"); + + idx_low(Number<0>{}) = low_idx_; + } + + template + __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff& /* idx_diff_up */, + LowIdx& /* idx_low */, + const UpIdx& /* idx_up_new */, + Number) + { + idx_diff_low(Number<0>{}) = 0; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return true; + } + + template + __host__ __device__ static constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */) + { + return true; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("DynamicFreeze"); + printf("low_idx_ %d", index_t{low_idx_}); + } +}; + +// Insert a dangling upper dimension without lower dimension +template +struct DynamicInsert +{ + using UpLengths = decltype(make_tuple(UpperLength{})); + + UpLengths up_lengths_; + + __host__ __device__ constexpr DynamicInsert() = default; + + __host__ __device__ constexpr DynamicInsert(const UpperLength& up_length) + : up_lengths_{make_tuple(up_length)} + { + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 0; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } + + __host__ __device__ constexpr auto GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx&, const UpIdx&) const + { + static_assert(LowIdx::Size() == 0 && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + } + + template + __host__ __device__ static void + UpdateLowerIndex(LowIdxDiff&, const UpIdxDiff&, LowIdx&, const UpIdx&, Number) + { + static_assert(LowIdxDiff::Size() == 0 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 0 && + UpIdx::Size() == 1, + "wrong! 
inconsistent # of dimension"); + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return true; + } + + template + __host__ __device__ static constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */) + { + return true; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("DynamicInsert"); + print_multi_index(up_lengths_); + } +}; + +template +struct DynamicVectorize +{ + using LowerIndex = MultiIndex<1>; + using UpperIndex = MultiIndex<1>; + + using UpLengths = decltype(make_tuple(UpLength{})); + + UpLengths up_lengths_; + VectorSize vector_size_; + + __host__ __device__ constexpr DynamicVectorize() = default; + + __host__ __device__ constexpr DynamicVectorize(const VectorSize& vector_size, + const UpLength& up_length) + : vector_size_{vector_size}, up_lengths_{make_tuple(up_length)} + { + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const + { + static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + idx_low(Number<0>{}) = vector_size_ * idx_up[Number<0>{}]; + } + + template + __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx&, + Number) const + { + static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 && + UpIdx::Size() == 1, + "wrong! 
inconsistent # of dimension"); + + constexpr auto I0 = Number<0>{}; + + idx_diff_low(I0) = vector_size_ * idx_diff_up[I0]; + + idx_low += idx_diff_low; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return true; + } + + template + __host__ __device__ static constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */) + { + return true; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("DynamicVectorize, "); + printf("up_lengths_"); + print_multi_index(up_lengths_); + printf("}"); + } +}; + +template +struct DynamicSlice +{ + using LowerIndex = MultiIndex<1>; + using UpperIndex = MultiIndex<1>; + + using UpLengths = decltype(make_tuple(SliceEnd{} - SliceBegin{})); + + UpLengths up_lengths_; + SliceBegin slice_begin_; + SliceEnd slice_end_; + + __host__ __device__ constexpr DynamicSlice() = default; + + __host__ __device__ constexpr DynamicSlice(const LowLength&, + const SliceBegin& slice_begin, + const SliceEnd& slice_end) + : up_lengths_{make_tuple(slice_end - slice_begin)}, + slice_begin_{slice_begin}, + slice_end_{slice_end} + { + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& idx_up) const + { + static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + idx_low(Number<0>{}) = idx_up[Number<0>{}] + slice_begin_; + } + + template + __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx&, + Number) + { + static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 && + UpIdx::Size() == 1, + "wrong! 
inconsistent # of dimension"); + + constexpr auto I0 = Number<0>{}; + + idx_diff_low(I0) = idx_diff_up[I0]; + + idx_low += idx_diff_low; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return true; + } + + template + __host__ __device__ constexpr bool IsValidUpperIndexMappedToValidLowerIndex(const UpIdx&) const + { + return true; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value && + is_known_at_compile_time::value && + is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("DynamicSlice, "); + printf("up_lengths_"); + print_multi_index(up_lengths_); + printf("slice_begin_ %d", index_t{slice_begin_}); + printf("slice_end %d", index_t{slice_end_}); + printf("}"); + } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_description/dynamic_multi_index_transform_helper.hpp b/composable_kernel/include/tensor_description/dynamic_multi_index_transform_helper.hpp new file mode 100644 index 0000000000..b3e1c60485 --- /dev/null +++ b/composable_kernel/include/tensor_description/dynamic_multi_index_transform_helper.hpp @@ -0,0 +1,104 @@ +#ifndef CK_DYNAMIC_MULTI_INDEX_TRANSFORM_HELPER_HPP +#define CK_DYNAMIC_MULTI_INDEX_TRANSFORM_HELPER_HPP + +#include "common_header.hpp" +#include "dynamic_multi_index_transform.hpp" + +namespace ck { + +template +__host__ __device__ constexpr auto make_pass_through_transform(const LowLength& low_length) +{ + return DynamicPassThrough{low_length}; +} + +template +__host__ __device__ constexpr auto +make_pad_transform(const LowLength& low_length, + const LeftPad& left_pad, + const RightPad& right_pad, + integral_constant = integral_constant{}) +{ + return DynamicPad{ + low_length, left_pad, right_pad}; +} + +template +__host__ __device__ constexpr auto make_left_pad_transform( + const LowLength& low_length, + const LeftPad& left_pad, + integral_constant = integral_constant{}) +{ + return DynamicLeftPad{low_length, left_pad}; +} + +template +__host__ __device__ constexpr auto make_right_pad_transform( + const LowLength& low_length, + const RightPad& right_pad, + integral_constant = integral_constant{}) +{ + return DynamicRightPad{low_length, right_pad}; +} + +template ::type = false> +__host__ __device__ constexpr auto make_embed_transform(const UpLengths& up_lengths, + const Coefficients& coefficients) +{ + return DynamicEmbed{up_lengths, coefficients}; +} + +template +__host__ __device__ constexpr auto make_merge_transform(const LowLengths& low_lengths) +{ +#if !CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION + return DynamicMerge_v1_carry_check{low_lengths}; +#else +#if 1 + return DynamicMerge_v2_magic_division{low_lengths}; +#else + return DynamicMerge_v2r2_magic_division{low_lengths}; +#endif +#endif +} + +template +__host__ __device__ constexpr auto +make_merge_transform_v2_magic_division(const LowLengths& low_lengths) +{ + return DynamicMerge_v2_magic_division{low_lengths}; +} + +template +__host__ __device__ constexpr auto make_unmerge_transform( + const UpLengths& up_lengths, + integral_constant = integral_constant{}) +{ + return DynamicUnMerge{up_lengths}; +} + +template +__host__ __device__ constexpr auto make_freeze_transform(const LowerIndex& low_idx) +{ + return DynamicFreeze{low_idx}; +} + +template +__host__ __device__ constexpr auto make_slice_transform(const LowLength& 
low_length, + const SliceBegin& slice_begin, + const SliceEnd& slice_end) +{ + return DynamicSlice{low_length, slice_begin, slice_end}; +} + +template +__host__ __device__ constexpr auto make_vectorize_transform(const VectorSize& vector_size, + const UpLength& up_length) +{ + return DynamicVectorize{vector_size, up_length}; +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp b/composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp new file mode 100644 index 0000000000..b9ca26c879 --- /dev/null +++ b/composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp @@ -0,0 +1,596 @@ +#ifndef CK_DYNAMIC_TENSOR_DESCRIPTOR_HPP +#define CK_DYNAMIC_TENSOR_DESCRIPTOR_HPP + +#include "common_header.hpp" +#include "dynamic_multi_index_transform.hpp" + +namespace ck { + +template +struct DynamicTensorCoordinate; + +template +struct DynamicTensorCoordinateIterator; + +// Transforms: Tuple +// LowerDimensionIdss : Tuple, ...> +// UpperDimensionIdss : Tuple, ...> +// VisibleDimensionIds> : Sequence<...> +template +struct DynamicTensorDescriptor +{ + // TODO make these private + __host__ __device__ static constexpr index_t GetNumOfTransform() { return Transforms::Size(); } + + __host__ __device__ static constexpr index_t GetNumOfVisibleDimension() + { + return VisibleDimensionIds::Size(); + } + + __host__ __device__ static constexpr index_t GetNumOfHiddenDimension() + { + constexpr auto all_low_dim_ids = unpack( + [](auto&&... xs) constexpr { return merge_sequences(xs...); }, LowerDimensionIdss{}); + + constexpr auto all_up_dim_ids = unpack( + [](auto&&... xs) constexpr { return merge_sequences(xs...); }, UpperDimensionIdss{}); + + constexpr auto all_dim_ids = merge_sequences(all_low_dim_ids, all_up_dim_ids); + + using unique_sort_all_dim_ids = typename sequence_unique_sort, + math::equal>::type; + + return unique_sort_all_dim_ids::Size(); + } + + __host__ __device__ static constexpr auto InitializeElementSize(const Transforms& transforms) + { + const auto lengths = generate_tuple( + [&](auto idim_visible) { + constexpr auto tmp = GetTransformAndItsUpperDimension(idim_visible); + + constexpr index_t itran = tmp[Number<0>{}]; + constexpr index_t idim_up = tmp[Number<1>{}]; + constexpr bool found = tmp[Number<2>{}]; + + static_assert(found == true, + "wrong! 
not found matching transformation and upper-dimension"); + + const auto length = + transforms[Number{}].GetUpperLengths()[Number{}]; + + return length; + }, + Number{}); + + // TODO: make container_reduce support tuple of Number and index_t + return container_reduce(lengths, math::multiplies_v2{}, Number<1>{}); + } + + template + __host__ __device__ static constexpr auto GetTransformAndItsUpperDimension(Number) + { + constexpr auto idim_visible = Number{}; + + constexpr index_t idim_hidden = VisibleDimensionIds::At(idim_visible); + + index_t itran_found = 0; + index_t idim_up_found = 0; + bool found = false; + + static_for<0, ntransform_, 1>{}([&](auto itran) { + constexpr auto up_dim_ids = UpperDimensionIdss{}[itran]; + + static_for<0, up_dim_ids.Size(), 1>{}([&](auto idim_up) { + if constexpr(up_dim_ids[idim_up] == idim_hidden) + { + itran_found = itran; + idim_up_found = idim_up; + found = true; + } + }); + }); + + return make_tuple(itran_found, idim_up_found, found); + } + + constexpr static index_t ntransform_ = GetNumOfTransform(); + constexpr static index_t ndim_visible_ = GetNumOfVisibleDimension(); + constexpr static index_t ndim_hidden_ = GetNumOfHiddenDimension(); + + using VisibleIndex = MultiIndex; + using HiddenIndex = MultiIndex; + using Coordinate = DynamicTensorCoordinate; + + // may be index_t or Number<> + using ElementSize = remove_cv_t; + + public: + __host__ __device__ constexpr DynamicTensorDescriptor() = default; + + __host__ __device__ constexpr DynamicTensorDescriptor(const Transforms& transforms, + ElementSpaceSize element_space_size) + : transforms_{transforms}, + element_size_{InitializeElementSize(transforms)}, + element_space_size_{element_space_size} + + { + static_assert(Transforms::Size() == ntransform_ && + LowerDimensionIdss::Size() == ntransform_ && + UpperDimensionIdss::Size() == ntransform_, + "wrong! inconsistent # of transformations"); + + // TODO check dependency of dimensions is valid + } + + __host__ __device__ static constexpr index_t GetNumOfDimension() + { + return GetNumOfVisibleDimension(); + } + + template + __host__ __device__ constexpr auto GetLength(Number) const + { + static_assert(IDim >= 0 && IDim < ndim_visible_, "wrong! out of range"); + + constexpr auto tmp = GetTransformAndItsUpperDimension(Number{}); + + constexpr index_t itran = tmp[Number<0>{}]; + constexpr index_t idim_up = tmp[Number<1>{}]; + constexpr bool found = tmp[Number<2>{}]; + + static_assert(found == true, + "wrong! not found matching transformation and upper-dimension"); + + return transforms_[Number{}].GetUpperLengths()[Number{}]; + } + + __host__ __device__ constexpr auto GetElementSize() const { return element_size_; } + + __host__ __device__ constexpr auto GetElementSpaceSize() const { return element_space_size_; } + + template + __host__ __device__ constexpr index_t CalculateOffset(const Idx& idx) const + { + static_assert(Idx::Size() == GetNumOfDimension(), "wrong! 
inconsistent # of dimension"); + + return make_dynamic_tensor_coordinate(*this, idx).GetOffset(); + } + + // TODO make these private + __host__ __device__ constexpr const auto& GetTransforms() const { return transforms_; } + + __host__ __device__ static constexpr auto GetLowerDimensionIdss() + { + return LowerDimensionIdss{}; + } + + __host__ __device__ static constexpr auto GetUpperDimensionIdss() + { + return UpperDimensionIdss{}; + } + + __host__ __device__ static constexpr auto GetVisibleDimensionIds() + { + return VisibleDimensionIds{}; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + bool is_known = true; + + static_for<0, Transforms::Size(), 1>{}([&](auto i) { + is_known &= + remove_cv_t>::IsKnownAtCompileTime(); + }); + + return is_known && is_known_at_compile_time::value && + is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("DynamicTensorDescriptor, "); + static_for<0, ntransform_, 1>{}([&](auto i) { + printf("transforms: "); + transforms_[i].Print(); + printf("LowerDimensionIds:"); + LowerDimensionIdss{}.At(i).Print(); + printf("UpperDimensionIds:"); + UpperDimensionIdss{}.At(i).Print(); + }); + printf("}"); + + VisibleDimensionIds::Print(); + } + + // TODO make these private + Transforms transforms_; + ElementSize element_size_; + ElementSpaceSize element_space_size_; +}; + +template +struct DynamicTensorCoordinate +{ + // TODO make these private + static constexpr index_t ndim_visible_ = VisibleDimensionIds::Size(); + + using HiddenIndex = MultiIndex; + using VisibleIndex = MultiIndex; + + public: + __host__ __device__ constexpr DynamicTensorCoordinate() = default; + + __host__ __device__ constexpr DynamicTensorCoordinate(const HiddenIndex& idx_hidden) + : idx_hidden_{idx_hidden} + { + } + + __host__ __device__ constexpr auto GetIndex() const { return GetVisibleIndex(); } + + __host__ __device__ constexpr index_t GetOffset() const { return idx_hidden_[Number<0>{}]; } + + // TODO make these private + __host__ __device__ constexpr const auto& GetHiddenIndex() const { return idx_hidden_; } + + __host__ __device__ auto& GetHiddenIndex() { return idx_hidden_; } + + __host__ __device__ constexpr auto GetVisibleIndex() const + { + return get_container_subset(idx_hidden_, VisibleDimensionIds{}); + } + + // TODO make these private + HiddenIndex idx_hidden_; +}; + +template +struct DynamicTensorCoordinateIterator +{ + // TODO make these private + using VisibleIndex = MultiIndex; + + public: + __host__ __device__ constexpr DynamicTensorCoordinateIterator() = default; + + __host__ __device__ constexpr DynamicTensorCoordinateIterator( + const VisibleIndex& idx_diff_visible, const MultiIndex& do_transforms) + : idx_diff_visible_{idx_diff_visible}, do_transforms_{do_transforms} + { + } + + __host__ __device__ constexpr const auto& GetIndexDiff() const { return GetVisibleIndexDiff(); } + + // TODO make these private + __host__ __device__ constexpr const auto& GetVisibleIndexDiff() const + { + return idx_diff_visible_; + } + + VisibleIndex idx_diff_visible_; + MultiIndex do_transforms_; + + // HACK: control UpdateLowerIndex() + static constexpr UpdateLowerIndexHack update_lower_index_hack_; +}; + +// TODO: How to fix this? 
It uses an struct instead of lambda because lambda +// doesn't have constructor, and to put it outside the scope where it is used +// (transform_dynamic_tensor_descriptor) because template cannot be defined inside a function +// template +template +struct lambda_get_up_dim_num +{ + template + __host__ __device__ constexpr auto operator()(I) const + { + using Tran = remove_reference_t; + return Number{}; + } +}; + +template +__host__ __device__ constexpr auto +transform_dynamic_tensor_descriptor(const OldTensorDescriptor& old_tensor_desc, + const NewTransforms& new_transforms, + NewLowerDimensionOldVisibleIdss, + NewUpperDimensionNewVisibleIdss) +{ + // sanity check + { + constexpr auto all_old_top_ids = unpack([](auto... xs) { return merge_sequences(xs...); }, + NewLowerDimensionOldVisibleIdss{}); + + constexpr auto all_new_top_ids = unpack([](auto... xs) { return merge_sequences(xs...); }, + NewUpperDimensionNewVisibleIdss{}); + + static_assert(is_valid_sequence_map::value && + is_valid_sequence_map::value, + "wrong!"); + } + + // lower dimension's hidden idss + // convert lower dimension visible idss (tuple of sequences) to hidden idss (tuple of + // sequences) + constexpr auto low_dim_hidden_idss = transform_tuples( + // convert lower dimension visible ids (a sequence) to hidden ids (a sequence) + [](auto low_dim_visible_ids) constexpr { + return transform_sequences( + // convert lower dimension visible id to hidden id + [](auto low_dim_visible_id) constexpr { + return OldTensorDescriptor::GetVisibleDimensionIds()[low_dim_visible_id]; + }, + low_dim_visible_ids); + }, + NewLowerDimensionOldVisibleIdss{}); + + constexpr index_t num_new_transform = NewTransforms::Size(); + + // upper dimension's hidden idss + constexpr index_t old_hidden_dim_number = OldTensorDescriptor::GetNumOfHiddenDimension(); + + constexpr auto up_dim_numbers = + generate_sequence(lambda_get_up_dim_num{}, Number{}); + + constexpr auto up_dim_numbers_scan = merge_sequences( + Sequence<0>{}, inclusive_scan_sequence(up_dim_numbers, math::plus{}, Number<0>{})); + + constexpr auto up_dim_hidden_idss = generate_tuple( + [ old_hidden_dim_number, up_dim_numbers_scan ](auto i) constexpr { + return + typename arithmetic_sequence_gen::type{}; + }, + Number{}); + + // new visible dimension's hidden ids + constexpr auto unordered_new_visible_dim_hidden_ids = unpack( + [](auto... xs) constexpr { return merge_sequences(xs...); }, up_dim_hidden_idss); + + constexpr auto new_visible_dim_unordered2ordered = unpack( + [](auto... 
xs) constexpr { return merge_sequences(xs...); }, + NewUpperDimensionNewVisibleIdss{}); + + constexpr auto new_visible_dim_hidden_ids = + unordered_new_visible_dim_hidden_ids.ReorderGivenOld2New(new_visible_dim_unordered2ordered); + + // put everything together + const auto all_transforms = container_concat(old_tensor_desc.GetTransforms(), new_transforms); + + constexpr auto all_low_dim_hidden_idss = + container_concat(OldTensorDescriptor::GetLowerDimensionIdss(), low_dim_hidden_idss); + + constexpr auto all_up_dim_hidden_idss = + container_concat(OldTensorDescriptor::GetUpperDimensionIdss(), up_dim_hidden_idss); + + const auto element_space_size = old_tensor_desc.GetElementSpaceSize(); + + return DynamicTensorDescriptor, + remove_cv_t, + remove_cv_t, + remove_cv_t, + remove_cv_t>{all_transforms, + element_space_size}; +} + +template +__host__ __device__ constexpr auto make_dynamic_tensor_coordinate(const TensorDesc& tensor_desc, + const VisibleIndex& idx_visible) +{ + static_assert(TensorDesc::GetNumOfDimension() == VisibleIndex::Size(), + "wrong! # of dimension inconsistent"); + + constexpr index_t ntransform = TensorDesc::GetNumOfTransform(); + constexpr index_t ndim_hidden = TensorDesc::GetNumOfHiddenDimension(); + constexpr auto visible_dim_ids = TensorDesc::GetVisibleDimensionIds(); + + MultiIndex idx_hidden; + + // initialize visible index + set_container_subset(idx_hidden, visible_dim_ids, idx_visible); + + // calculate hidden index + static_for{}([&tensor_desc, &idx_hidden](auto itran_p1) { + auto itran = itran_p1 - Number<1>{}; + const auto& tran = tensor_desc.GetTransforms().At(itran); + constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran); + constexpr auto dims_up = TensorDesc::GetUpperDimensionIdss().At(itran); + + const auto idx_up = get_container_subset(idx_hidden, dims_up); + + MultiIndex idx_low; + + tran.CalculateLowerIndex(idx_low, idx_up); + + set_container_subset(idx_hidden, dims_low, idx_low); + }); + + return DynamicTensorCoordinate{idx_hidden}; +} + +// UpdateLowerIndexHack: Sequence<...> +// HACK: control UpdateLowerIndex +template +__host__ __device__ constexpr auto make_dynamic_tensor_coordinate_iterator( + const TensorDesc&, const VisibleIndex& idx_diff_visible, UpdateLowerIndexHack) +{ + static_assert(TensorDesc::GetNumOfDimension() == VisibleIndex::Size(), + "wrong! 
# of dimension inconsistent"); + + constexpr index_t ntransform = TensorDesc::GetNumOfTransform(); + constexpr index_t ndim_hidden = TensorDesc::GetNumOfHiddenDimension(); + constexpr index_t ndim_visible = TensorDesc::GetNumOfVisibleDimension(); + constexpr auto visible_dim_ids = TensorDesc::GetVisibleDimensionIds(); + + static_assert(UpdateLowerIndexHack::Size() == ntransform, "wrong!"); + + // use index_t for boolean type + auto do_transforms = make_zero_multi_index(); + auto is_non_zero_diff = make_zero_multi_index(); + + // decide do_transform by checkout non-zero index diff components + MultiIndex non_zero_diff_pick_visible; + + static_for<0, ndim_visible, 1>{}( + [&](auto i) { non_zero_diff_pick_visible(i) = (idx_diff_visible[i] != 0); }); + + set_container_subset(is_non_zero_diff, visible_dim_ids, non_zero_diff_pick_visible); + + static_for{}([&](auto itran) { + constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran); + constexpr auto dims_up = TensorDesc::GetUpperDimensionIdss().At(itran); + + const auto non_zero_diff_pick_up = get_container_subset(is_non_zero_diff, dims_up); + + MultiIndex non_zero_diff_pick_low; + + // if any of upper index diff components is non-zero, then + // 1) Need to do this transform + // 2) all components of lower index diff will assume to be non-zero and need to be + // computed + const bool idx_diff_up_has_non_zero = container_reduce( + non_zero_diff_pick_up, [](auto a, auto b) constexpr { return a or b; }, false); + + do_transforms(itran) = idx_diff_up_has_non_zero; + + static_for<0, dims_low.Size(), 1>{}( + [&](auto i) { non_zero_diff_pick_low(i) = idx_diff_up_has_non_zero; }); + + set_container_subset(is_non_zero_diff, dims_low, non_zero_diff_pick_low); + }); + + return DynamicTensorCoordinateIterator{ + idx_diff_visible, do_transforms}; +} + +template +__host__ __device__ constexpr auto +make_dynamic_tensor_coordinate_iterator(const TensorDesc&, const VisibleIndex& idx_diff_visible) +{ + constexpr index_t ntransform = TensorDesc::GetNumOfTransform(); + + return make_dynamic_tensor_coordinate_iterator( + TensorDesc{}, idx_diff_visible, typename uniform_sequence_gen::type{}); +} + +template +__host__ __device__ constexpr void move_dynamic_tensor_coordinate( + const TensorDesc& tensor_desc, TensorCoord& coord, const TensorCoordIterator& coord_iterator) +{ + constexpr index_t ndim_hidden = TensorDesc::GetNumOfHiddenDimension(); + constexpr index_t ntransform = TensorDesc::GetNumOfTransform(); + + // this is what needs to be calculated + auto idx_diff_hidden = make_zero_multi_index(); + + // initialize visible index diff + set_container_subset(idx_diff_hidden, + TensorDesc::GetVisibleDimensionIds(), + coord_iterator.GetVisibleIndexDiff()); + + // this is what needs to be updated + auto& idx_hidden = coord.GetHiddenIndex(); + + // update visible index + auto idx_hidden_pick_visible = + get_container_subset(idx_hidden, TensorDesc::GetVisibleDimensionIds()); + + idx_hidden_pick_visible += coord_iterator.GetIndexDiff(); + + set_container_subset(idx_hidden, TensorDesc::GetVisibleDimensionIds(), idx_hidden_pick_visible); + + // update rest of hidden index + static_for{}([&](auto itran) { + if(coord_iterator.do_transforms_[itran]) + { + const auto& tran = tensor_desc.GetTransforms().At(itran); + constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran); + constexpr auto dims_up = TensorDesc::GetUpperDimensionIdss().At(itran); + + const auto idx_up_new = get_container_subset(idx_hidden, dims_up); + auto idx_low = 
get_container_subset(idx_hidden, dims_low); + const auto idx_diff_up = get_container_subset(idx_diff_hidden, dims_up); + + MultiIndex idx_diff_low; + + // HACK: control UpdateLowerIndex for DynamicMerge using hack + constexpr index_t Hack = decltype(coord_iterator.update_lower_index_hack_)::At(itran); + + tran.UpdateLowerIndex(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number{}); + + set_container_subset(idx_diff_hidden, dims_low, idx_diff_low); + set_container_subset(idx_hidden, dims_low, idx_low); + } + }); +} + +template +__host__ __device__ constexpr bool +coordinate_has_valid_offset_assuming_visible_index_is_valid(const TensorDesc& tensor_desc, + const TensorCoord& coord) +{ + bool valid = true; + + constexpr index_t ntransform = TensorDesc::GetNumOfTransform(); + + const auto& idx_hidden = coord.GetHiddenIndex(); + + static_for{}([&tensor_desc, &idx_hidden, &valid](auto itran) { + const auto tran = tensor_desc.GetTransforms().At(itran); + + // check validity, only if current transformation does not always have a valid mapping + if constexpr(!decltype(tran)::IsValidUpperIndexAlwaysMappedToValidLowerIndex()) + { + const auto idx_up = + get_container_subset(idx_hidden, TensorDesc::GetUpperDimensionIdss().At(itran)); + + // Comment: using valid = valid && .. will result in weird control flow in ISA + valid &= tran.IsValidUpperIndexMappedToValidLowerIndex(idx_up); + } + }); + + return valid; +} + +template +__host__ __device__ constexpr bool coordinate_has_valid_offset(const TensorDesc& tensor_desc, + const TensorCoord& coord) +{ + // check visible index + const auto& idx_visible = coord.GetVisibleIndex(); + + bool is_visible_index_valid = true; + + static_for<0, TensorDesc::GetNumOfDimension(), 1>{}( + [&is_visible_index_valid, &idx_visible, &tensor_desc](auto i) { + is_visible_index_valid = + is_visible_index_valid && + (idx_visible[i] >= 0 && idx_visible[i] < tensor_desc.GetLength(i)); + }); + + // check other hidden index + return is_visible_index_valid && + coordinate_has_valid_offset_assuming_visible_index_is_valid(tensor_desc, coord); +} + +template +using DynamicTensorCoordinate_t = decltype(make_dynamic_tensor_coordinate( + TensorDesc{}, MultiIndex>::GetNumOfDimension()>{})); + +template +using DynamicTensorCoordinateIterator_t = decltype(make_dynamic_tensor_coordinate_iterator( + TensorDesc{}, MultiIndex>::GetNumOfDimension()>{})); + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper.hpp b/composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper.hpp new file mode 100644 index 0000000000..2e36451a66 --- /dev/null +++ b/composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper.hpp @@ -0,0 +1,150 @@ +#ifndef CK_DYNAMIC_TENSOR_DESCRIPTOR_HELPER_HPP +#define CK_DYNAMIC_TENSOR_DESCRIPTOR_HELPER_HPP + +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_multi_index_transform_helper.hpp" + +namespace ck { + +/* + * These functions create tensor descriptors at runtime. If they are not constexpr, you will + * likely see usage of scratch memory during construction of these tensor descriptors. It is + * therefore better to call these functions on the host and then pass the constructed tensor + * descriptors to the GPU. If the tensor descriptors being constructed are constexpr, then you + * can call them on the GPU without worrying about scratch memory usage.
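+ *
+ * A minimal usage sketch (illustrative only; n, c, h and w stand for whatever runtime index_t
+ * lengths the caller already has):
+ *
+ *   // on the host
+ *   const auto in_desc =
+ *       make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, c, h, w));
+ *   // pass in_desc to the kernel by value instead of re-constructing it on the GPU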
+ */ + +#if CK_WORKAROUND_SWDEV_275126 +template +__host__ __device__ constexpr auto calculate_element_space_size_impl(const Lengths& lengths, + const Strides& strides, + Number i, + AccOld acc_old) +{ + auto acc_new = acc_old + (lengths[i] - Number<1>{}) * strides[i]; + + if constexpr(i.value < Lengths::Size() - 1) + { + return calculate_element_space_size_impl(lengths, strides, i + Number<1>{}, acc_new); + } + else + { + return acc_new; + } +} +#endif + +template ::type = false> +__host__ __device__ constexpr auto +make_dynamic_naive_tensor_descriptor_v2(const Tuple& lengths, + const Tuple& strides) +{ + constexpr index_t N = sizeof...(Lengths); + + const auto transforms = make_tuple(make_embed_transform(lengths, strides)); + + constexpr auto low_dim_hidden_idss = make_tuple(Sequence<0>{}); + + constexpr auto up_dim_hidden_idss = + make_tuple(typename arithmetic_sequence_gen<1, N + 1, 1>::type{}); + + constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{}; + +#if !CK_WORKAROUND_SWDEV_275126 + // rocm-4.1 compiler would crash for recursive labmda + // recursive function for reduction + auto f = [&](auto fs, auto i, auto acc_old) { + auto acc_new = acc_old + (lengths[i] - Number<1>{}) * strides[i]; + + if constexpr(i.value < N - 1) + { + return fs(fs, i + Number<1>{}, acc_new); + } + else + { + return acc_new; + } + }; + + const auto element_space_size = f(f, Number<0>{}, Number<1>{}); +#else + const auto element_space_size = + calculate_element_space_size_impl(lengths, strides, Number<0>{}, Number<1>{}); +#endif + + return DynamicTensorDescriptor, + remove_cv_t, + remove_cv_t, + remove_cv_t, + remove_cv_t>{transforms, + element_space_size}; +} + +// Lengths... can be: +// 1) index_t, which is known at run-time +// 2) Number<>, which is known at compile-time +template +__host__ __device__ constexpr auto +make_dynamic_naive_tensor_descriptor_packed_v2(const Tuple& lengths) +{ + constexpr index_t N = sizeof...(Lengths); + + const auto transforms = make_tuple(make_unmerge_transform(lengths)); + + constexpr auto low_dim_hidden_idss = make_tuple(Sequence<0>{}); + + constexpr auto up_dim_hidden_idss = + make_tuple(typename arithmetic_sequence_gen<1, N + 1, 1>::type{}); + + constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{}; + + const auto element_space_size = container_reduce(lengths, math::multiplies_v2{}, Number<1>{}); + + return DynamicTensorDescriptor, + remove_cv_t, + remove_cv_t, + remove_cv_t, + remove_cv_t>{transforms, + element_space_size}; +} + +template +__host__ __device__ constexpr auto +make_dynamic_naive_tensor_descriptor_aligned_v2(const Tuple& lengths, Align align) +{ + constexpr auto I1 = Number<1>{}; + + constexpr index_t N = sizeof...(Lengths); + + const auto stride_n_minus_2 = math::integer_least_multiple(lengths[Number{}], align); + + auto strides = generate_tuple( + [&](auto i) { + if constexpr(i.value == N - 1) + { + return I1; + } + else if constexpr(i.value == N - 2) + { + return Number{}; + } + else + { + return container_reduce(lengths, + math::multiplies_v2{}, + Number{}, + i + I1, + Number{}, + I1); + } + }, + Number{}); + + return make_dynamic_naive_tensor_descriptor_v2(lengths, strides); +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_description/tensor_adaptor.hpp b/composable_kernel/include/tensor_description/tensor_adaptor.hpp new file mode 100644 index 0000000000..6affe6141f --- /dev/null +++ 
b/composable_kernel/include/tensor_description/tensor_adaptor.hpp @@ -0,0 +1,466 @@ +#ifndef CK_TENSOR_ADAPTOR_HPP +#define CK_TENSOR_ADAPTOR_HPP + +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" + +namespace ck { + +// Transforms: Tuple +// LowerDimensionHiddenIdss : Tuple, ...> +// UpperDimensionHiddenIdss : Tuple, ...> +// BottomDimensionHiddenIds : Sequence<...> +// TopDimensionHiddenIds : Sequence<...> +template +struct TensorAdaptor +{ + __host__ __device__ static constexpr index_t GetNumOfTransform() { return Transforms::Size(); } + + __host__ __device__ constexpr const auto& GetTransforms() const { return transforms_; } + + __host__ __device__ static constexpr auto GetLowerDimensionHiddenIdss() + { + return LowerDimensionHiddenIdss{}; + } + + __host__ __device__ static constexpr auto GetUpperDimensionHiddenIdss() + { + return UpperDimensionHiddenIdss{}; + } + + __host__ __device__ static constexpr auto GetTopDimensionHiddenIds() + { + return TopDimensionHiddenIds{}; + } + + __host__ __device__ static constexpr auto GetBottomDimensionHiddenIds() + { + return BottomDimensionHiddenIds{}; + } + + __host__ __device__ static constexpr auto InitializeElementSize(const Transforms& transforms) + { + const auto lengths = generate_tuple( + [&](auto idim_top) { + constexpr auto tmp = GetTransformAndItsUpperDimension(idim_top); + + constexpr index_t itran = tmp[Number<0>{}]; + constexpr index_t idim_up = tmp[Number<1>{}]; + constexpr bool found = tmp[Number<2>{}]; + + static_assert(found == true, + "wrong! not found matching transformation and upper-dimension"); + + const auto length = + transforms[Number{}].GetUpperLengths()[Number{}]; + + return length; + }, + Number{}); + + // TODO: make container_reduce support tuple of Number and index_t + return container_reduce(lengths, math::multiplies_v2{}, Number<1>{}); + } + + template + __host__ __device__ static constexpr auto GetTransformAndItsUpperDimension(Number) + { + constexpr auto idim_top = Number{}; + + constexpr index_t idim_hidden = TopDimensionHiddenIds::At(idim_top); + + index_t itran_found = 0; + index_t idim_up_found = 0; + bool found = false; + + static_for<0, ntransform_, 1>{}([&](auto itran) { + constexpr auto up_dim_ids = UpperDimensionHiddenIdss{}[itran]; + + static_for<0, up_dim_ids.Size(), 1>{}([&](auto idim_up) { + if constexpr(up_dim_ids[idim_up] == idim_hidden) + { + itran_found = itran; + idim_up_found = idim_up; + found = true; + } + }); + }); + + return make_tuple(itran_found, idim_up_found, found); + } + + __host__ __device__ static constexpr index_t GetNumOfBottomDimension() + { + return BottomDimensionHiddenIds::Size(); + } + + __host__ __device__ static constexpr index_t GetNumOfTopDimension() + { + return TopDimensionHiddenIds::Size(); + } + + __host__ __device__ static constexpr index_t GetNumOfHiddenDimension() + { + constexpr auto all_low_dim_ids = unpack( + [](auto&&... xs) constexpr { return merge_sequences(xs...); }, + LowerDimensionHiddenIdss{}); + + constexpr auto all_up_dim_ids = unpack( + [](auto&&... 
xs) constexpr { return merge_sequences(xs...); }, + UpperDimensionHiddenIdss{}); + + constexpr auto all_dim_ids = merge_sequences(all_low_dim_ids, all_up_dim_ids); + + using unique_sort_all_dim_ids = typename sequence_unique_sort, + math::equal>::type; + + return unique_sort_all_dim_ids::Size(); + } + + constexpr static index_t ntransform_ = GetNumOfTransform(); + constexpr static index_t ndim_hidden_ = GetNumOfHiddenDimension(); + constexpr static index_t ndim_bottom_ = GetNumOfBottomDimension(); + constexpr static index_t ndim_top_ = GetNumOfTopDimension(); + + using HiddenIndex = MultiIndex; + using BottomIndex = MultiIndex; + using TopIndex = MultiIndex; + + // may be index_t or Number<> + using ElementSize = remove_cv_t; + + public: + __host__ __device__ constexpr TensorAdaptor() = default; + + __host__ __device__ constexpr TensorAdaptor(const Transforms& transforms) + : transforms_{transforms}, element_size_{InitializeElementSize(transforms)} + { + static_assert(Transforms::Size() == ntransform_ && + LowerDimensionHiddenIdss::Size() == ntransform_ && + UpperDimensionHiddenIdss::Size() == ntransform_, + "wrong! inconsistent # of transformations"); + + // TODO check dependency of dimensions is valid + } + + __host__ __device__ constexpr auto GetElementSize() const { return element_size_; } + + template + __host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const + { + static_assert(TopIdx::Size() == TopDimensionHiddenIds::Size(), + "wrong! # of dimension inconsistent"); + + constexpr index_t ntransform = GetNumOfTransform(); + constexpr index_t ndim_hidden = GetNumOfHiddenDimension(); + + MultiIndex idx_hidden; + + // initialize uppest index + set_container_subset(idx_hidden, GetTopDimensionHiddenIds(), idx_top); + + // calculate hidden index + static_for{}([&](auto itran_p1) { + auto itran = itran_p1 - Number<1>{}; + const auto& tran = GetTransforms().At(itran); + constexpr auto dims_low = GetLowerDimensionHiddenIdss().At(itran); + constexpr auto dims_up = GetUpperDimensionHiddenIdss().At(itran); + + const auto idx_up = get_container_subset(idx_hidden, dims_up); + + MultiIndex idx_low; + + tran.CalculateLowerIndex(idx_low, idx_up); + + set_container_subset(idx_hidden, dims_low, idx_low); + }); + + return get_container_subset(idx_hidden, BottomDimensionHiddenIds{}); + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + bool is_known = true; + + static_for<0, Transforms::Size(), 1>{}([&](auto i) { + is_known &= + remove_cv_t>::IsKnownAtCompileTime(); + }); + + return is_known && is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("TensorAdaptor, "); + static_for<0, ntransform_, 1>{}([&](auto i) { + printf("transforms: "); + transforms_[i].Print(); + printf("LowerDimensionHiddenIds:"); + LowerDimensionHiddenIdss{}.At(i).Print(); + printf("UpperDimensionHiddenIds:"); + UpperDimensionHiddenIdss{}.At(i).Print(); + }); + + printf("BottomDimensionHiddenIds:"); + BottomDimensionHiddenIds::Print(); + printf("TopDimensionHiddenIds:"); + TopDimensionHiddenIds::Print(); + + printf("}"); + } + + private: + Transforms transforms_; + ElementSize element_size_; +}; + +template +__host__ __device__ constexpr auto chain_tensor_adaptors(const TensorAdaptor0& adaptor0, + const TensorAdaptor1& adaptor1) +{ + static_assert(TensorAdaptor0::GetNumOfTopDimension() == + TensorAdaptor1::GetNumOfBottomDimension(), + "wrong!"); + + // all_transforms = transform0 + transform1 + const auto all_transforms = + 
container_concat(adaptor0.GetTransforms(), adaptor1.GetTransforms()); + + // shift + constexpr index_t adaptor0_max_hidden_id = [&]() { + index_t adaptor0_max_hidden_id_ = NumericLimits::Min(); + + static_for<0, TensorAdaptor0::GetNumOfTransform(), 1>{}([&](auto itran) { + constexpr index_t ndim_low = + TensorAdaptor0{}.GetTransforms()[itran].GetNumOfLowerDimension(); + + static_for<0, ndim_low, 1>{}([&](auto idim_low) { + adaptor0_max_hidden_id_ = + math::max(adaptor0_max_hidden_id_, + TensorAdaptor0::GetLowerDimensionHiddenIdss()[itran][idim_low].value); + }); + + constexpr index_t ndim_up = + TensorAdaptor0{}.GetTransforms()[itran].GetNumOfUpperDimension(); + + static_for<0, ndim_up, 1>{}([&](auto idim_up) { + adaptor0_max_hidden_id_ = + math::max(adaptor0_max_hidden_id_, + TensorAdaptor0::GetUpperDimensionHiddenIdss()[itran][idim_up].value); + }); + }); + + return adaptor0_max_hidden_id_; + }(); + + constexpr index_t adaptor1_min_hidden_id = [&]() { + index_t adaptor1_min_hidden_id_ = NumericLimits::Max(); + + static_for<0, TensorAdaptor1::GetNumOfTransform(), 1>{}([&](auto itran) { + constexpr index_t ndim_low = + TensorAdaptor1{}.GetTransforms()[itran].GetNumOfLowerDimension(); + + // get the min of all lower dimenions, but not bottom dimension (because their id will + // be matched with top id from adaptor0) + static_for<0, ndim_low, 1>{}([&](auto idim_low) { + constexpr index_t low_dim_hidden_id = + TensorAdaptor1::GetLowerDimensionHiddenIdss()[itran][idim_low].value; + + bool is_bottom_dim = false; + static_for<0, TensorAdaptor1::GetNumOfBottomDimension(), 1>{}([&](auto i) { + if constexpr(low_dim_hidden_id == + TensorAdaptor1::GetBottomDimensionHiddenIds()[i]) + { + is_bottom_dim = true; + } + }); + + if(!is_bottom_dim) + { + adaptor1_min_hidden_id_ = math::min(adaptor1_min_hidden_id_, low_dim_hidden_id); + } + }); + + constexpr index_t ndim_up = + TensorAdaptor1{}.GetTransforms()[itran].GetNumOfUpperDimension(); + + // get the min of all upper dimensions + static_for<0, ndim_up, 1>{}([&](auto idim_up) { + adaptor1_min_hidden_id_ = + math::min(adaptor1_min_hidden_id_, + TensorAdaptor1::GetUpperDimensionHiddenIdss()[itran][idim_up].value); + }); + }); + + return adaptor1_min_hidden_id_; + }(); + + constexpr index_t adaptor1_hidden_id_shift = + adaptor0_max_hidden_id + 1 - adaptor1_min_hidden_id; + + constexpr index_t ndim_bottom_1 = TensorAdaptor1::GetNumOfBottomDimension(); + + // all_low_dim_hidden_idss = + // low_dim_hidden_idss_0 + match_hidden_id_for_1(shift_hidden_id_for_1(low_dim_hiden_idss_1)) + constexpr auto low_dim_hidden_idss_1 = generate_tuple( + // generate sequence of ids for a transform + [&](auto itran) { + constexpr auto ndim_low_1 = TensorAdaptor1::GetLowerDimensionHiddenIdss()[itran].Size(); + + constexpr auto low_dim_hidden_ids_1 = + TensorAdaptor1::GetLowerDimensionHiddenIdss()[itran]; + + // sequence in, sequence out + constexpr auto low_dim_hidden_ids_1_mod = [&]() constexpr + { + auto low_dim_hidden_ids_1_mod_ = to_multi_index(low_dim_hidden_ids_1); + + // shift hidden id so every dim id is unique + static_for<0, ndim_low_1, 1>{}([&](auto idim_low_1) { + low_dim_hidden_ids_1_mod_(idim_low_1) += adaptor1_hidden_id_shift; + }); + + // match hidden id + static_for<0, ndim_low_1, 1>{}([&](auto idim_low_1) { + static_for<0, ndim_bottom_1, 1>{}([&](auto idim_bottom_1) { + // if this low dim is bottom dim, then do id matching + if constexpr(low_dim_hidden_ids_1[idim_low_1] == + TensorAdaptor1::GetBottomDimensionHiddenIds()[idim_bottom_1]) + { + 
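+                            // id matching: this bottom dim of adaptor1 takes over the hidden id of
+                            // the corresponding top dim of adaptor0, so the chained adaptor routes
+                            // indices from adaptor1's transforms directly into adaptor0's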
low_dim_hidden_ids_1_mod_(idim_low_1) = + TensorAdaptor0::GetTopDimensionHiddenIds()[idim_bottom_1]; + } + }); + }); + + return low_dim_hidden_ids_1_mod_; + } + (); + + return generate_sequence_v2( + [&](auto i) constexpr { return Number{}; }, + Number{}); + }, + Number{}); + + constexpr auto all_low_dim_hidden_idss = + container_concat(TensorAdaptor0::GetLowerDimensionHiddenIdss(), low_dim_hidden_idss_1); + + // all_up_dim_hidden_idss = + // up_dim_hidden_idss_0 + shift_hidden_id_for_1(up_dim_hiden_idss_1) + constexpr auto up_dim_hidden_idss_1 = generate_tuple( + // generate sequence of ids for a transform + [&](auto itran) { + constexpr auto ndim_up_1 = TensorAdaptor1::GetUpperDimensionHiddenIdss()[itran].Size(); + + constexpr auto up_dim_hidden_ids_1 = + TensorAdaptor1::GetUpperDimensionHiddenIdss()[itran]; + + // sequence in, constexpr tuple out + constexpr auto up_dim_hidden_ids_1_mod = [&]() constexpr + { + auto up_dim_hidden_ids_1_mod_ = to_multi_index(up_dim_hidden_ids_1); + + // shift hidden id + static_for<0, ndim_up_1, 1>{}([&](auto idim_up_1) { + up_dim_hidden_ids_1_mod_(idim_up_1) += adaptor1_hidden_id_shift; + }); + + return up_dim_hidden_ids_1_mod_; + } + (); + + // constexpr tuple to sequence + return generate_sequence_v2( + [&](auto i) constexpr { return Number{}; }, + Number{}); + }, + Number{}); + + constexpr auto all_up_dim_hidden_idss = + container_concat(TensorAdaptor0::GetUpperDimensionHiddenIdss(), up_dim_hidden_idss_1); + + // bottom_dim_hidden_ids = bottom_dim_hidden_ids_0 + constexpr auto bottom_dim_hidden_ids = TensorAdaptor0::GetBottomDimensionHiddenIds(); + + // top_dim_hidden_ids = shift_hidden_id(top_dim_hidden_ids_1) + constexpr auto top_dim_hidden_ids = + TensorAdaptor1::GetTopDimensionHiddenIds() + Number{}; + + // put everything together + return TensorAdaptor, + remove_cv_t, + remove_cv_t, + remove_cv_t, + remove_cv_t>{all_transforms}; +} + +// Transforms: Tuple +// LowerDimensionOldTopIdss: Tuple, ...> +// UpperDimensionNewTopIdss: Tuple, ...> +template +__host__ __device__ constexpr auto make_single_stage_tensor_adaptor(const Transforms& transforms, + LowerDimensionOldTopIdss, + UpperDimensionNewTopIdss) +{ + constexpr index_t ntransform = Transforms::Size(); + + static_assert(LowerDimensionOldTopIdss::Size() == ntransform && + UpperDimensionNewTopIdss::Size() == ntransform, + "wrong!"); + + // sanity check on LowerDimensionOldTopIdss and UpperDimensionNewTopIdss + constexpr auto all_low_dim_old_top_ids = unpack( + [](auto&&... xs) constexpr { return merge_sequences(xs...); }, LowerDimensionOldTopIdss{}); + + constexpr auto all_up_dim_new_top_ids = unpack( + [](auto&&... 
xs) constexpr { return merge_sequences(xs...); }, UpperDimensionNewTopIdss{}); + + static_assert(is_valid_sequence_map::value && + is_valid_sequence_map::value, + "wrong!"); + + constexpr index_t ndim_old_top = all_low_dim_old_top_ids.Size(); + constexpr index_t ndim_new_top = all_up_dim_new_top_ids.Size(); + + // low_dim_hidden_idss + constexpr auto low_dim_hidden_idss = LowerDimensionOldTopIdss{}; + + // up_dim_hidden_idss: shift UpperDimensionNewTopIdss by ndim_bottom + constexpr auto up_dim_hidden_idss = generate_tuple( + [](auto itran) { return UpperDimensionNewTopIdss{}[itran] + Number{}; }, + Number{}); + + // bottom_dim_hidden_ids + constexpr auto bottom_dim_hidden_ids = + typename arithmetic_sequence_gen<0, ndim_old_top, 1>::type{}; + + // top_dim_hidden_ids + constexpr auto top_dim_hidden_ids = + typename arithmetic_sequence_gen<0, ndim_new_top, 1>::type{} + Number{}; + + return TensorAdaptor, + remove_cv_t, + remove_cv_t, + remove_cv_t, + remove_cv_t>{transforms}; +} + +template = 2, bool>::type = false> +__host__ __device__ constexpr auto chain_tensor_adaptors(const X& x, const Xs&... xs) +{ + return chain_tensor_adaptors(x, chain_tensor_adaptors(xs...)); +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer.hpp b/composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer.hpp new file mode 100644 index 0000000000..694b2fd2cc --- /dev/null +++ b/composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer.hpp @@ -0,0 +1,171 @@ +#ifndef CK_BLOCKWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_HPP +#define CK_BLOCKWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_HPP + +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "cluster_descriptor.hpp" +#include "threadwise_dynamic_tensor_slice_transfer.hpp" + +namespace ck { + +// this version does following things to avoid scratch memory issue +// 1. Use StaticallyIndexedArray instead of C array for thread buffer +// 2. ThreadwiseDynamicTensorSliceTransfer_v3 does not keep reference to tensor descriptor +// 3. ThreadwiseDynamicTensorSliceTransfer_v3::Run() does not construct new tensor coordinate +template +struct BlockwiseDynamicTensorSliceTransfer_v4 +{ + static constexpr index_t nDim = remove_reference_t::GetNumOfDimension(); + + using Index = MultiIndex; + + __device__ constexpr BlockwiseDynamicTensorSliceTransfer_v4(const SrcDesc& src_desc, + const Index& src_block_slice_origin, + const DstDesc& dst_desc, + const Index& dst_block_slice_origin) + : threadwise_transfer_( + src_desc, make_zero_multi_index(), dst_desc, make_zero_multi_index()) + + { + static_assert(nDim == remove_reference_t>::GetNumOfDimension() && + nDim == remove_reference_t>::GetNumOfDimension() && + nDim == BlockSliceLengths::Size() && nDim == ThreadSliceLengths::Size() && + nDim == ThreadClusterLengths::Size() && + nDim == ThreadClusterArrangeOrder::Size() && + nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(), + "wrong! nDim not consistent"); + + static_assert( + is_same{}, + "wrong! threads should be mapped to cover entire slicing window"); + + static_assert(BlockSize >= thread_cluster_desc_.GetElementSize(), + "wrong! 
BlockSize too small"); + + if(BlockSize == thread_cluster_desc_.GetElementSize() or + get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) + { + const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex( + make_multi_index(get_thread_local_1d_id())); + + const auto thread_data_idx_begin = thread_cluster_idx * ThreadSliceLengths{}; + + threadwise_transfer_.SetSrcSliceOrigin(src_desc, + src_block_slice_origin + thread_data_idx_begin); + threadwise_transfer_.SetDstSliceOrigin(dst_desc, + dst_block_slice_origin + thread_data_idx_begin); + } + } + + template + __device__ void RunRead(const SrcDesc& src_desc, + const SrcBuffer& src_buf, + const SrcIteratorHacks& src_iterator_hacks) + { + if(BlockSize == thread_cluster_desc_.GetElementSize() or + get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.RunRead(src_desc, src_buf, src_iterator_hacks); + } + } + + template + __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf) + { + if(BlockSize == thread_cluster_desc_.GetElementSize() or + get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.RunRead(src_desc, src_buf); + } + } + + template + __device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf) + { + if(BlockSize == thread_cluster_desc_.GetElementSize() or + get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.RunWrite(dst_desc, dst_buf); + } + } + + __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step) + { + if(BlockSize == thread_cluster_desc_.GetElementSize() or + get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.MoveSrcSliceWindow(src_desc, step); + } + } + + // SrcMoveSliceWindowIteratorHack to control index calculation move slice window + template + __device__ void + MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& step, + const SrcMoveSliceWindowIteratorHack& src_move_slice_window_iterator_hack) + { + if(BlockSize == thread_cluster_desc_.GetElementSize() or + get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.MoveSrcSliceWindow( + src_desc, step, src_move_slice_window_iterator_hack); + } + } + + __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& step) + { + if(BlockSize == thread_cluster_desc_.GetElementSize() or + get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.MoveDstSliceWindow(dst_desc, step); + } + } + + private: + static constexpr auto thread_cluster_desc_ = + make_cluster_descriptor_v2(ThreadClusterLengths{}, ThreadClusterArrangeOrder{}); + + using ThreadwiseTransfer = + ThreadwiseDynamicTensorSliceTransfer_v3; + + ThreadwiseTransfer threadwise_transfer_; +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer_v2.hpp b/composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer_v2.hpp new file mode 100644 index 0000000000..20f3225f82 --- /dev/null +++ b/composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer_v2.hpp @@ -0,0 +1,158 @@ +#ifndef CK_BLOCKWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_V2_HPP +#define CK_BLOCKWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_V2_HPP + +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "cluster_descriptor.hpp" +#include 
"threadwise_dynamic_tensor_slice_transfer_v2.hpp" + +namespace ck { + +// this version does following things to avoid scratch memory issue +// 1. Use StaticallyIndexedArray instead of C array for thread buffer +// 2. ThreadwiseDynamicTensorSliceTransfer_v3 does not keep reference to tensor descriptor +// 3. ThreadwiseDynamicTensorSliceTransfer_v3::Run() does not construct new tensor coordinate +template +struct BlockwiseDynamicTensorSliceTransfer_v4r1 +{ + static constexpr index_t nDim = remove_reference_t::GetNumOfDimension(); + + using Index = MultiIndex; + + __device__ constexpr BlockwiseDynamicTensorSliceTransfer_v4r1( + const SrcDesc& src_desc, + const Index& src_block_slice_origin, + const DstDesc& dst_desc, + const Index& dst_block_slice_origin) + : threadwise_transfer_( + src_desc, make_zero_multi_index(), dst_desc, make_zero_multi_index()) + + { + static_assert(nDim == remove_reference_t>::GetNumOfDimension() && + nDim == remove_reference_t>::GetNumOfDimension() && + nDim == BlockSliceLengths::Size() && nDim == ThreadSliceLengths::Size() && + nDim == ThreadClusterLengths::Size() && + nDim == ThreadClusterArrangeOrder::Size() && + nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(), + "wrong! nDim not consistent"); + + static_assert( + is_same{}, + "wrong! threads should be mapped to cover entire slicing window"); + + static_assert(BlockSize >= thread_cluster_desc_.GetElementSize(), + "wrong! BlockSize too small"); + + if(BlockSize == thread_cluster_desc_.GetElementSize() or + get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) + { + const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex( + make_multi_index(get_thread_local_1d_id())); + + const auto thread_data_idx_begin = thread_cluster_idx * ThreadSliceLengths{}; + + threadwise_transfer_.SetSrcSliceOrigin(src_desc, + src_block_slice_origin + thread_data_idx_begin); + threadwise_transfer_.SetDstSliceOrigin(dst_desc, + dst_block_slice_origin + thread_data_idx_begin); + } + } + + template + __device__ void RunRead(const SrcDesc& src_desc, + const SrcBuffer& src_buf, + const SrcIteratorHacks& src_iterator_hacks) + { + if(BlockSize == thread_cluster_desc_.GetElementSize() or + get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.RunRead(src_desc, src_buf, src_iterator_hacks); + } + } + + template + __device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf) + { + if(BlockSize == thread_cluster_desc_.GetElementSize() or + get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.RunWrite(dst_desc, dst_buf); + } + } + + __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step) + { + if(BlockSize == thread_cluster_desc_.GetElementSize() or + get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.MoveSrcSliceWindow(src_desc, step); + } + } + + // SrcMoveSliceWindowIteratorHack to control index calculation move slice window + template + __device__ void + MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& step, + const SrcMoveSliceWindowIteratorHack& src_move_slice_window_iterator_hack) + { + if(BlockSize == thread_cluster_desc_.GetElementSize() or + get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.MoveSrcSliceWindow( + src_desc, step, src_move_slice_window_iterator_hack); + } + } + + __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& step) + { + if(BlockSize == 
thread_cluster_desc_.GetElementSize() or + get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.MoveDstSliceWindow(dst_desc, step); + } + } + + private: + static constexpr auto thread_cluster_desc_ = + make_cluster_descriptor_v2(ThreadClusterLengths{}, ThreadClusterArrangeOrder{}); + + using ThreadwiseTransfer = + ThreadwiseDynamicTensorSliceTransfer_v3r1; + + ThreadwiseTransfer threadwise_transfer_; +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r2.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r2.hpp new file mode 100644 index 0000000000..694cf9c6a3 --- /dev/null +++ b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r2.hpp @@ -0,0 +1,396 @@ +#ifndef CK_BLOCKWISE_GEMM_DLOPS_V2R2_HPP +#define CK_BLOCKWISE_GEMM_DLOPS_V2R2_HPP + +#include "common_header.hpp" +#include "tensor_adaptor.hpp" +#include "threadwise_dynamic_tensor_slice_transfer.hpp" +#include "threadwise_contraction_dlops.hpp" + +namespace ck { + +// C[M0, M1, N0, N1] += transpose(A[K, M0, M1]) * B[K, N0, N1] +// A and B are visable to the whole block, C is distributed among each thread +// Assume: +// 1. A: +// 1. AKMBlockDesc is known at compile-time +// 2. ABlockBuffer is DynamicBuffer +// 2. B: +// 1. BKNBlockDesc is known at compile-time +// 2. BBlockBuffer is DynamicBuffer +// 3. C: +// 1. CM0M1N0N1ThreadDesc is known at compile-time +// 2. CThreadBuffer is StaticBuffer +// Also assume: +// M0 = N0 = 2. It will do 2x2 pipelined read and fma (ABBA optimization) +template ::type = false> +struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2 +{ + using AIndex = MultiIndex<3>; + using BIndex = MultiIndex<3>; + using CIndex = MultiIndex<4>; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr index_t K = AKMBlockDesc{}.GetLength(I0); + static constexpr index_t M = AKMBlockDesc{}.GetLength(I1); + static constexpr index_t N = BKNBlockDesc{}.GetLength(I1); + + static constexpr index_t M100 = M1N1ThreadClusterM100; + static constexpr index_t N100 = M1N1ThreadClusterN100; + + static constexpr index_t M101 = M1N1ThreadClusterM101; + static constexpr index_t N101 = M1N1ThreadClusterN101; + + static constexpr index_t M11 = M1PerThreadM11; + static constexpr index_t N11 = N1PerThreadN11; + + static constexpr index_t M1 = M1N1ThreadClusterM100 * M1N1ThreadClusterM101 * M1PerThreadM11; + static constexpr index_t N1 = M1N1ThreadClusterN100 * M1N1ThreadClusterN101 * N1PerThreadN11; + + static constexpr index_t M0 = M / M1; + static constexpr index_t N0 = N / N1; + + __host__ __device__ static constexpr auto + MakeAKM0M1BlockDescriptor(const AKMBlockDesc& a_k_m_block_desc) + { + const auto a_k_m0_m1_block_desc = transform_dynamic_tensor_descriptor( + AKMBlockDesc{}, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple(Number{}, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{})); + + return a_k_m0_m1_block_desc; + } + + __host__ __device__ static constexpr auto + MakeBKN0N1BlockDescriptor(const BKNBlockDesc& b_k_n_block_desc) + { + const auto b_k_n0_n1_block_desc = transform_dynamic_tensor_descriptor( + BKNBlockDesc{}, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple(Number{}, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), 
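+                // the Sequence tuple above gives the lower dims each transform reads,
+                // the one below the upper dims it produces: K passes through as dim 0,
+                // while N is unmerged into (N0, N1) occupying dims 1 and 2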
+ make_tuple(Sequence<0>{}, Sequence<1, 2>{})); + + return b_k_n0_n1_block_desc; + } + + __host__ __device__ static constexpr auto MakeCM0M100M101M11N0N100N101N11ToMNBlockAdaptor() + { + // upper: [M0, M100, M101, M11, N0, N100, N101, N11] + // lower: [M, N] + constexpr auto c_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n_block_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple( + Number{}, Number{}, Number{}, Number{})), + make_unmerge_transform(make_tuple( + Number{}, Number{}, Number{}, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4, 5, 6, 7>{})); + + return c_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n_block_adaptor; + } + + __host__ __device__ static constexpr auto + MakeCM0M100M101M11N0N100N101N11ToM0M1N0N1BlockAdaptor() + { + // upper: [M0, M100, M101, M11, N0, N100, N101, N11] + // lower: [M0, M1, N0, N1] + constexpr auto c_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1_block_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{})), + make_pass_through_transform(Number{}), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{}, Sequence<5, 6, 7>{})); + + return c_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1_block_adaptor; + } + + __host__ __device__ static constexpr auto GetCM0M1N0N1ThreadTensorLengths() + { + return Sequence{}; + } + + static constexpr auto a_k_m0_m1_block_desc_ = MakeAKM0M1BlockDescriptor(AKMBlockDesc{}); + static constexpr auto b_k_n0_n1_block_desc_ = MakeBKN0N1BlockDescriptor(BKNBlockDesc{}); + + public: + __device__ BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2() + : c_thread_origin_data_idx_{CalculateCM0M1N0N1ThreadOriginOnBlock( + get_thread_local_1d_id())}, + a_thread_copy_{ + make_tuple(0, c_thread_origin_data_idx_[I0], c_thread_origin_data_idx_[I1])}, + b_thread_copy_{ + make_tuple(0, c_thread_origin_data_idx_[I2], c_thread_origin_data_idx_[I3])} + { + static_assert(AKMBlockDesc::IsKnownAtCompileTime() && BKNBlockDesc::IsKnownAtCompileTime(), + "wrong! Desc should be known at compile-time"); + + static_assert(BlockSize == M101 * M100 * N101 * N100, + "wrong! blocksize and cluster size not consistent"); + + static_assert(M % M1 == 0 && N % N1 == 0, "wrong!"); + + static_assert(AKMBlockDesc{}.GetLength(I0) == BKNBlockDesc{}.GetLength(I0), + "wrong! 
K dimension not consistent"); + + // TODO: remove this restriction + static_assert(M0 == 2 && N0 == 2, "wrong"); + } + + __device__ static CIndex CalculateCM0M1N0N1ThreadOriginOnBlock(index_t thread_id) + { + // lower: [M0, M1, N0, N1] + // upper: [M0, M100, M101, M11, N0, N100, N101, N11] + constexpr auto adaptor0 = MakeCM0M100M101M11N0N100N101N11ToM0M1N0N1BlockAdaptor(); + + // lower: [M0, M100, M101, M11, N0, N100, N101, N11] + // upper: [Tid, M0, M11, N0, N11] + constexpr auto adaptor1 = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M100, N100, M101, N101)), + make_pass_through_transform(M0), + make_pass_through_transform(M11), + make_pass_through_transform(N0), + make_pass_through_transform(N11)), + make_tuple( + Sequence<1, 5, 2, 6>{}, Sequence<0>{}, Sequence<3>{}, Sequence<4>{}, Sequence<7>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + + constexpr auto adaptor = chain_tensor_adaptors(adaptor0, adaptor1); + + return adaptor.CalculateBottomIndex(make_multi_index(thread_id, 0, 0, 0, 0)); + } + + __host__ __device__ static constexpr index_t GetABlockAlignment() { return M1PerThreadM11; } + + __host__ __device__ static constexpr auto GetBBlockAlignment() { return N1PerThreadN11; } + + template + __device__ void Run(const CM0M1N0N1ThreadDesc& c_m0_m1_n0_n1_thread_desc, + const ABlockBuffer& a_block_buf, + const BBlockBuffer& b_block_buf, + CThreadBuffer& c_thread_buf) const + { + static_assert(CM0M1N0N1ThreadDesc::IsKnownAtCompileTime(), + "wrong! Desc should be known at compile-time"); + + // TODO: remove this restriction + static_assert(M0 == 2 && N0 == 2 && CM0M1N0N1ThreadDesc{}.GetLength(I0) == M0 && + CM0M1N0N1ThreadDesc{}.GetLength(I2) == N0, + "wrong"); + + auto a_thread_buf = make_static_buffer( + a_k_m0_m1_thread_desc_.GetElementSpaceSize()); + auto b_thread_buf = make_static_buffer( + b_k_n0_n1_thread_desc_.GetElementSpaceSize()); + + constexpr auto threadwise_gemm = + ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1, + Sequence<1, M1PerThreadM11>, + Sequence<1, N1PerThreadN11>>{}; + + // read A_sub_0 + a_thread_copy_.Run(a_k_m0_m1_block_desc_, + make_tuple(I0, I0, I0), + a_block_buf, + a_k_m0_m1_thread_desc_, + make_tuple(I0, I0, I0), + a_thread_buf); + + // read B_sub_0 + b_thread_copy_.Run(b_k_n0_n1_block_desc_, + make_tuple(I0, I0, I0), + b_block_buf, + b_k_n0_n1_thread_desc_, + make_tuple(I0, I0, I0), + b_thread_buf); + + // read B_sub_1 + b_thread_copy_.Run(b_k_n0_n1_block_desc_, + make_tuple(I0, I1, I0), + b_block_buf, + b_k_n0_n1_thread_desc_, + make_tuple(I0, I1, I0), + b_thread_buf); + + // read A_sub_1 + a_thread_copy_.Run(a_k_m0_m1_block_desc_, + make_tuple(I0, I1, I0), + a_block_buf, + a_k_m0_m1_thread_desc_, + make_tuple(I0, I1, I0), + a_thread_buf); + + // C_sub_00 += transpose(A_sub_0) * B_sub_0 + threadwise_gemm.Run(a_thread_buf, + make_tuple(I0, I0, I0), + b_thread_buf, + make_tuple(I0, I0, I0), + c_thread_buf, + make_tuple(I0, I0, I0, I0)); + + // C_sub_01 += transpose(A_sub_0) * B_sub_1 + threadwise_gemm.Run(a_thread_buf, + make_tuple(I0, I0, I0), + b_thread_buf, + make_tuple(I0, I1, I0), + c_thread_buf, + make_tuple(I0, I0, I1, I0)); + + // loop over rest of k + static_for{}([&](auto k) { + // read A_sub_0 + a_thread_copy_.Run(a_k_m0_m1_block_desc_, + make_tuple(k, I0, I0), + a_block_buf, + a_k_m0_m1_thread_desc_, + make_tuple(I0, I0, I0), + a_thread_buf); + + // C_sub_10 += transpose(A_sub_1) * B_sub_0 + threadwise_gemm.Run(a_thread_buf, + make_tuple(I0, I1, I0), + b_thread_buf, + 
make_tuple(I0, I0, I0), + c_thread_buf, + make_tuple(I1, I0, I0, I0)); + + // read B_sub_0 + b_thread_copy_.Run(b_k_n0_n1_block_desc_, + make_tuple(k, I0, I0), + b_block_buf, + b_k_n0_n1_thread_desc_, + make_tuple(I0, I0, I0), + b_thread_buf); + + // C_sub_11 += transpose(A_sub_1) * B_sub_1 + threadwise_gemm.Run(a_thread_buf, + make_tuple(I0, I1, I0), + b_thread_buf, + make_tuple(I0, I1, I0), + c_thread_buf, + make_tuple(I1, I0, I1, I0)); + + // read B_sub_1 + b_thread_copy_.Run(b_k_n0_n1_block_desc_, + make_tuple(k, I1, I0), + b_block_buf, + b_k_n0_n1_thread_desc_, + make_tuple(I0, I1, I0), + b_thread_buf); + + // read A_sub_1 + a_thread_copy_.Run(a_k_m0_m1_block_desc_, + make_tuple(k, I1, I0), + a_block_buf, + a_k_m0_m1_thread_desc_, + make_tuple(I0, I1, I0), + a_thread_buf); + + // C_sub_00 += transpose(A_sub_0) * B_sub_0 + threadwise_gemm.Run(a_thread_buf, + make_tuple(I0, I0, I0), + b_thread_buf, + make_tuple(I0, I0, I0), + c_thread_buf, + make_tuple(I0, I0, I0, I0)); + + // C_sub_01 += transpose(A_sub_0) * B_sub_1 + threadwise_gemm.Run(a_thread_buf, + make_tuple(I0, I0, I0), + b_thread_buf, + make_tuple(I0, I1, I0), + c_thread_buf, + make_tuple(I0, I0, I1, I0)); + }); + + // C_sub_10 += transpose(A_sub_1) * B_sub_0 + threadwise_gemm.Run(a_thread_buf, + make_tuple(I0, I1, I0), + b_thread_buf, + make_tuple(I0, I0, I0), + c_thread_buf, + make_tuple(I1, I0, I0, I0)); + + // C_sub_11 += transpose(A_sub_1) * B_sub_1 + threadwise_gemm.Run(a_thread_buf, + make_tuple(I0, I1, I0), + b_thread_buf, + make_tuple(I0, I1, I0), + c_thread_buf, + make_tuple(I1, I0, I1, I0)); + } + + private: + // A[K, M0, M1] + static constexpr auto a_k_m0_m1_thread_desc_ = make_dynamic_naive_tensor_descriptor_packed_v2( + make_tuple(Number{}, Number{}, Number{})); + + // B[K, N0, N1] + static constexpr auto b_k_n0_n1_thread_desc_ = make_dynamic_naive_tensor_descriptor_packed_v2( + make_tuple(Number{}, Number{}, Number{})); + + using AThreadCopy = + ThreadwiseDynamicTensorSliceTransfer_v4, + Sequence<0, 1, 2>, + 2, + AThreadCopyScalarPerVector_M11, + 1>; + + using BThreadCopy = + ThreadwiseDynamicTensorSliceTransfer_v4, + Sequence<0, 1, 2>, + 2, + BThreadCopyScalarPerVector_N11, + 1>; + + CIndex c_thread_origin_data_idx_; + + AThreadCopy a_thread_copy_; + BThreadCopy b_thread_copy_; +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r3.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r3.hpp new file mode 100644 index 0000000000..6a3885936e --- /dev/null +++ b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r3.hpp @@ -0,0 +1,410 @@ +#ifndef CK_BLOCKWISE_GEMM_DLOPS_V2R3_HPP +#define CK_BLOCKWISE_GEMM_DLOPS_V2R3_HPP + +#include "common_header.hpp" +#include "tensor_adaptor.hpp" +#include "threadwise_dynamic_tensor_slice_transfer_v2.hpp" +#include "threadwise_contraction_dlops.hpp" + +namespace ck { + +// C[BM0, BM1, BN0, BN1] += transpose(A[K, BM0, BM1]) * B[K, BN0, BN1] +// A and B are visable to the whole block, C is distributed among each thread +// Assume: +// 1. A: +// 1. ABlockDesc_BK0_BM_BK1 is known at compile-time +// 2. ABlockBuffer is DynamicBuffer +// 2. B: +// 1. BBlockDesc_BK0_BN_BK1 is known at compile-time +// 2. BBlockBuffer is DynamicBuffer +// 3. C: +// 1. CThreadDesc_BM0_BM11_BN0_BN11 is known at compile-time +// 2. CThreadBuffer is StaticBuffer +// Also assume: +// BM10BN10ThreadClusterBM10Xs::Size() = BM10BN10ThreadClusterBN10Xs::Size() == 2 +// BM0 = BN0 = 2. 
It will do 2x2 pipelined read and fma (ABBA optimization) +template + typename BM10BN10ThreadClusterBN10Xs, // Sequence + index_t AThreadCopyScalarPerVector_BM11, + index_t BThreadCopyScalarPerVector_BN11, + typename std::enable_if::type = false> +struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2 +{ + using AIndex = MultiIndex<3>; + using BIndex = MultiIndex<3>; + using CIndex = MultiIndex<4>; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr index_t BK0 = ABlockDesc_BK0_BM_BK1{}.GetLength(I0); + static constexpr index_t BK1 = ABlockDesc_BK0_BM_BK1{}.GetLength(I2); + static constexpr index_t BM = ABlockDesc_BK0_BM_BK1{}.GetLength(I1); + static constexpr index_t BN = BBlockDesc_BK0_BN_BK1{}.GetLength(I1); + + static constexpr index_t BM100 = BM10BN10ThreadClusterBM10Xs{}[I0]; + static constexpr index_t BN100 = BM10BN10ThreadClusterBN10Xs{}[I0]; + + static constexpr index_t BM101 = BM10BN10ThreadClusterBM10Xs{}[I1]; + static constexpr index_t BN101 = BM10BN10ThreadClusterBN10Xs{}[I1]; + + static constexpr index_t BM11 = BM1PerThreadBM11; + static constexpr index_t BN11 = BN1PerThreadBN11; + + static constexpr index_t BM1 = BM100 * BM101 * BM11; + static constexpr index_t BN1 = BN100 * BN101 * BN11; + + static constexpr index_t BM0 = BM / BM1; + static constexpr index_t BN0 = BN / BN1; + + __host__ __device__ static constexpr auto + MakeABlockDescriptor_BK0_BM0_BM1_BK1(const ABlockDesc_BK0_BM_BK1& a_block_desc_bk0_bm_bk1) + { + const auto a_block_bk0_bm0_bm1_bk1 = transform_dynamic_tensor_descriptor( + a_block_desc_bk0_bm_bk1, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple(Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + + return a_block_bk0_bm0_bm1_bk1; + } + + __host__ __device__ static constexpr auto + MakeBBlockDescriptor_BK0_BN0_BN1_BK1(const BBlockDesc_BK0_BN_BK1& b_block_desc_bk0_bn_bk1) + { + const auto b_block_desc_bk0_bn0_bn1_bk1 = transform_dynamic_tensor_descriptor( + b_block_desc_bk0_bn_bk1, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple(Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + + return b_block_desc_bk0_bn0_bn1_bk1; + } + + __host__ __device__ static constexpr auto + MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM_BN() + { + // upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11] + // lower: [BM, BN] + constexpr auto c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n = + make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple( + Number{}, Number{}, Number{}, Number{})), + make_unmerge_transform(make_tuple( + Number{}, Number{}, Number{}, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4, 5, 6, 7>{})); + + return c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n; + } + + __host__ __device__ static constexpr auto + MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM0_BM1_BN0_BN1() + { + // upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11] + // lower: [BM0, BM1, BN0, BN1] + constexpr auto 
c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1 = + make_single_stage_tensor_adaptor( + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{})), + make_pass_through_transform(Number{}), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{}, Sequence<5, 6, 7>{})); + + return c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1; + } + + __host__ __device__ static constexpr auto GetCThreadTensorLengths_BM0_BM1_BN0_BN1() + { + return Sequence{}; + } + + static constexpr auto a_block_desc_bk0_bm0_bm1_bk1_ = + MakeABlockDescriptor_BK0_BM0_BM1_BK1(ABlockDesc_BK0_BM_BK1{}); + + static constexpr auto b_block_desc_bk0_bn0_bn1_bk1_ = + MakeBBlockDescriptor_BK0_BN0_BN1_BK1(BBlockDesc_BK0_BN_BK1{}); + + public: + __device__ BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2() + : c_thread_origin_data_idx_{CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1( + get_thread_local_1d_id())}, + a_thread_copy_{ + make_tuple(0, c_thread_origin_data_idx_[I0], c_thread_origin_data_idx_[I1], 0)}, + b_thread_copy_{ + make_tuple(0, c_thread_origin_data_idx_[I2], c_thread_origin_data_idx_[I3], 0)} + { + static_assert(ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() && + BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), + "wrong! Desc should be known at compile-time"); + + static_assert(BlockSize == BM101 * BM100 * BN101 * BN100, + "wrong! blocksize and cluster size not consistent"); + + static_assert(BM % BM1 == 0 && BN % BN1 == 0, "wrong!"); + + static_assert(ABlockDesc_BK0_BM_BK1{}.GetLength(I0) == + BBlockDesc_BK0_BN_BK1{}.GetLength(I0), + "wrong! K dimension not consistent"); + + // TODO remove this restriction + static_assert(BM10BN10ThreadClusterBM10Xs::Size() == 2 && + BM10BN10ThreadClusterBN10Xs::Size() == 2, + "wrong!"); + + // TODO: remove this restriction + static_assert(BM0 == 2 && BN0 == 2, "wrong"); + } + + __device__ static CIndex CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1(index_t thread_id) + { + // lower: [BM0, BM1, BN0, BN1] + // upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11] + constexpr auto adaptor0 = + MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM0_BM1_BN0_BN1(); + + // lower: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11] + // upper: [Tid, BM0, BM11, BN0, BN11] + constexpr auto adaptor1 = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(BM100, BN100, BM101, BN101)), + make_pass_through_transform(BM0), + make_pass_through_transform(BM11), + make_pass_through_transform(BN0), + make_pass_through_transform(BN11)), + make_tuple( + Sequence<1, 5, 2, 6>{}, Sequence<0>{}, Sequence<3>{}, Sequence<4>{}, Sequence<7>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + + constexpr auto adaptor = chain_tensor_adaptors(adaptor0, adaptor1); + + return adaptor.CalculateBottomIndex(make_multi_index(thread_id, 0, 0, 0, 0)); + } + + template + __device__ void Run(const CThreadDesc_BM0_BM11_BN0_BN11&, + const ABlockBuffer& a_block_buf, + const BBlockBuffer& b_block_buf, + CThreadBuffer& c_thread_buf) const + { + static_assert(CThreadDesc_BM0_BM11_BN0_BN11::IsKnownAtCompileTime(), + "wrong! 
Desc should be known at compile-time"); + + // TODO: remove this restriction + static_assert(BM0 == 2 && BN0 == 2 && + CThreadDesc_BM0_BM11_BN0_BN11{}.GetLength(I0) == BM0 && + CThreadDesc_BM0_BM11_BN0_BN11{}.GetLength(I2) == BN0, + "wrong"); + + auto a_thread_buf = make_static_buffer( + a_thread_desc_bk0_bm0_bm1_bk1_.GetElementSpaceSize()); + auto b_thread_buf = make_static_buffer( + b_thread_desc_bk0_bn0_bn1_bk1_.GetElementSpaceSize()); + + constexpr auto threadwise_contraction = + ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1< + FloatA, + FloatB, + FloatC, + decltype(a_thread_desc_bk0_bm0_bm1_bk1_), + decltype(b_thread_desc_bk0_bn0_bn1_bk1_), + CThreadDesc_BM0_BM11_BN0_BN11, + Sequence, + Sequence<1, BM1PerThreadBM11>, + Sequence<1, BN1PerThreadBN11>>{}; + + // read A_sub_0 + a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_, + make_tuple(I0, I0, I0, I0), + a_block_buf, + a_thread_desc_bk0_bm0_bm1_bk1_, + make_tuple(I0, I0, I0, I0), + a_thread_buf); + + // read B_sub_0 + b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_, + make_tuple(I0, I0, I0, I0), + b_block_buf, + b_thread_desc_bk0_bn0_bn1_bk1_, + make_tuple(I0, I0, I0, I0), + b_thread_buf); + + // read B_sub_1 + b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_, + make_tuple(I0, I1, I0, I0), + b_block_buf, + b_thread_desc_bk0_bn0_bn1_bk1_, + make_tuple(I0, I1, I0, I0), + b_thread_buf); + + // read A_sub_1 + a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_, + make_tuple(I0, I1, I0, I0), + a_block_buf, + a_thread_desc_bk0_bm0_bm1_bk1_, + make_tuple(I0, I1, I0, I0), + a_thread_buf); + + // C_sub_00 += transpose(A_sub_0) * B_sub_0 + threadwise_contraction.Run(a_thread_buf, + make_tuple(I0, I0, I0, I0), + b_thread_buf, + make_tuple(I0, I0, I0, I0), + c_thread_buf, + make_tuple(I0, I0, I0, I0)); + + // C_sub_01 += transpose(A_sub_0) * B_sub_1 + threadwise_contraction.Run(a_thread_buf, + make_tuple(I0, I0, I0, I0), + b_thread_buf, + make_tuple(I0, I1, I0, I0), + c_thread_buf, + make_tuple(I0, I0, I1, I0)); + + // loop over rest of bk0 + static_for{}([&](auto bk0) { + // read A_sub_0 + a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_, + make_tuple(bk0, I0, I0, I0), + a_block_buf, + a_thread_desc_bk0_bm0_bm1_bk1_, + make_tuple(I0, I0, I0, I0), + a_thread_buf); + + // C_sub_10 += transpose(A_sub_1) * B_sub_0 + threadwise_contraction.Run(a_thread_buf, + make_tuple(I0, I1, I0, I0), + b_thread_buf, + make_tuple(I0, I0, I0, I0), + c_thread_buf, + make_tuple(I1, I0, I0, I0)); + + // read B_sub_0 + b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_, + make_tuple(bk0, I0, I0, I0), + b_block_buf, + b_thread_desc_bk0_bn0_bn1_bk1_, + make_tuple(I0, I0, I0, I0), + b_thread_buf); + + // C_sub_11 += transpose(A_sub_1) * B_sub_1 + threadwise_contraction.Run(a_thread_buf, + make_tuple(I0, I1, I0, I0), + b_thread_buf, + make_tuple(I0, I1, I0, I0), + c_thread_buf, + make_tuple(I1, I0, I1, I0)); + + // read B_sub_1 + b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_, + make_tuple(bk0, I1, I0, I0), + b_block_buf, + b_thread_desc_bk0_bn0_bn1_bk1_, + make_tuple(I0, I1, I0, I0), + b_thread_buf); + + // read A_sub_1 + a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_, + make_tuple(bk0, I1, I0, I0), + a_block_buf, + a_thread_desc_bk0_bm0_bm1_bk1_, + make_tuple(I0, I1, I0, I0), + a_thread_buf); + + // C_sub_00 += transpose(A_sub_0) * B_sub_0 + threadwise_contraction.Run(a_thread_buf, + make_tuple(I0, I0, I0, I0), + b_thread_buf, + make_tuple(I0, I0, I0, I0), + c_thread_buf, + make_tuple(I0, I0, I0, I0)); + + // C_sub_01 += 
transpose(A_sub_0) * B_sub_1 + threadwise_contraction.Run(a_thread_buf, + make_tuple(I0, I0, I0, I0), + b_thread_buf, + make_tuple(I0, I1, I0, I0), + c_thread_buf, + make_tuple(I0, I0, I1, I0)); + }); + + // C_sub_10 += transpose(A_sub_1) * B_sub_0 + threadwise_contraction.Run(a_thread_buf, + make_tuple(I0, I1, I0, I0), + b_thread_buf, + make_tuple(I0, I0, I0, I0), + c_thread_buf, + make_tuple(I1, I0, I0, I0)); + + // C_sub_11 += transpose(A_sub_1) * B_sub_1 + threadwise_contraction.Run(a_thread_buf, + make_tuple(I0, I1, I0, I0), + b_thread_buf, + make_tuple(I0, I1, I0, I0), + c_thread_buf, + make_tuple(I1, I0, I1, I0)); + } + + private: + // A[BK0, BM0, BM1, BK1] + static constexpr auto a_thread_desc_bk0_bm0_bm1_bk1_ = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple( + Number{}, Number{}, Number{}, Number{})); + + // B[BK0, BN0, BN1, BK1] + static constexpr auto b_thread_desc_bk0_bn0_bn1_bk1_ = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple( + Number{}, Number{}, Number{}, Number{})); + + using AThreadCopy = ThreadwiseDynamicTensorSliceTransfer_v4r1< + FloatA, + FloatA, + decltype(a_block_desc_bk0_bm0_bm1_bk1_), + decltype(a_thread_desc_bk0_bm0_bm1_bk1_), + Sequence, // SliceLengths + Sequence<0, 1, 2, 3>, // DimAccessOrder + Sequence<1, 1, BM1PerThreadBM11, BK1>, // SrcVectorTensorLengths + Sequence<0, 1, 2, 3>>; // SrcVectorTensorContiguousDimOrder + + using BThreadCopy = ThreadwiseDynamicTensorSliceTransfer_v4r1< + FloatB, + FloatB, + decltype(b_block_desc_bk0_bn0_bn1_bk1_), + decltype(b_thread_desc_bk0_bn0_bn1_bk1_), + Sequence, // SliceLengths + Sequence<0, 1, 2, 3>, // DimAccessOrder + Sequence<1, 1, BN1PerThreadBN11, BK1>, // SrcVectorTensorLengths + Sequence<0, 1, 2, 3>>; // SrcVectorTensorContiguousDimOrder + + CIndex c_thread_origin_data_idx_; + + AThreadCopy a_thread_copy_; + BThreadCopy b_thread_copy_; +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp new file mode 100644 index 0000000000..074d519b76 --- /dev/null +++ b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp @@ -0,0 +1,190 @@ +#ifndef CK_BLOCKWISE_GEMM_DLOPS_V3_HPP +#define CK_BLOCKWISE_GEMM_DLOPS_V3_HPP + +#include "common_header.hpp" +#include "threadwise_gemm_dlops_v3.hpp" + +namespace ck { + +template +struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3 +{ + struct MatrixIndex + { + index_t k; + index_t h; + index_t w; + }; + + // HACK: fix this @Jing Zhang + static constexpr index_t KPerThreadSubC = 4; + + static constexpr auto a_thread_mtx_ = make_dynamic_naive_tensor_descriptor_packed_v2( + make_tuple(Number{}, Number{})); + + static constexpr auto b_thread_mtx_ = make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple( + Number{}, Number<1>{}, Number{}, Number{})); + + static constexpr auto c_thread_mtx_ = make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple( + Number{}, Number<1>{}, Number{}, Number{})); + + using AThreadCopy = + ThreadwiseDynamicTensorSliceTransfer_v4, + Sequence<0, 1>, + 1, + ThreadGemmADataPerRead_K, + 1>; + + __device__ BlockwiseGemmDlops_km_kn_m0m1n0n1_v3() + : c_thread_begin_mtx_idx_{GetBeginOfThreadMatrixC(get_thread_local_1d_id())}, + a_thread_copy_{make_tuple(0, c_thread_begin_mtx_idx_.k * KPerThread)} + { + static_assert(BlockMatrixA::IsKnownAtCompileTime() && + BlockMatrixB::IsKnownAtCompileTime() && + ThreadMatrixC::IsKnownAtCompileTime(), + "wrong! 
Desc should be known at compile-time"); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + static_assert(BlockMatrixA{}.GetLength(I0) == BlockMatrixB{}.GetLength(I0), + "wrong! K dimension not consistent\n"); + + constexpr index_t K = BlockMatrixA{}.GetLength(I1); // A is transposed + constexpr index_t N = BlockMatrixB{}.GetLength(I1); + constexpr index_t H = BlockMatrixB{}.GetLength(I2); + constexpr index_t W = BlockMatrixB{}.GetLength(I3); + + static_assert(K % KPerThread == 0 && H % HPerThread == 0 && W % WPerThread == 0, + "wrong! Cannot evenly divide work among\n"); + + constexpr auto KThreadCluster = K / KPerThread; + constexpr auto HThreadCluster = H / HPerThread; + constexpr auto WThreadCluster = W / WPerThread; + + static_assert(BlockSize == KThreadCluster * HThreadCluster * WThreadCluster, + "wrong! wrong blocksize\n"); + } + + __device__ static constexpr auto GetThreadMatrixCLengths() + { + return Sequence{}; + } + + __device__ static MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id) + { + constexpr index_t H = BlockMatrixB{}.GetLength(Number<2>{}); + constexpr index_t W = BlockMatrixB{}.GetLength(Number<3>{}); + + constexpr auto num_w_threads = W / WPerThread; + constexpr auto num_h_threads = H / HPerThread; + constexpr auto num_hw_threads = num_w_threads * num_h_threads; + + index_t k_thread_id = thread_id / num_hw_threads; + index_t hw_thread_id = thread_id % num_hw_threads; + + index_t h_thread_id = hw_thread_id / num_w_threads; + index_t w_thread_id = hw_thread_id % num_w_threads; + + return MatrixIndex{k_thread_id, h_thread_id, w_thread_id}; + } + + template + __device__ void Run(const ABlockBuffer& a_block_buf, + const BThreadBuffer& b_thread_buf, + CThreadBuffer& c_thread_buf) const + { + static_assert(is_same>, + remove_cv_t>>::value && + is_same>, + remove_cv_t>>::value && + is_same>, + remove_cv_t>>::value && + "wrong! 
inconsistent type"); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto a_block_mtx = BlockMatrixA{}; + + constexpr auto EPerBlock = a_block_mtx.GetLength(I0); + + // HACK: fix this @Jing Zhang + constexpr auto HoPerThreadSubC = 2; + constexpr auto WoPerThreadSubC = 2; + + static_assert(KPerThread % KPerThreadSubC == 0, ""); + static_assert(HPerThread % HoPerThreadSubC == 0, ""); + static_assert(WPerThread % WoPerThreadSubC == 0, ""); + + // thread A buffer for GEMM + StaticBuffer + a_thread_buf; + + constexpr auto threadwise_gemm = ThreadwiseGemmDlops_km_kn_mn_v3{}; + + static_for<0, EPerBlock, EPerThreadLoop>{}([&](auto e_begin) { + static_for<0, KPerThread, KPerThreadSubC>{}([&](auto k_begin) { + a_thread_copy_.Run(a_block_mtx, + make_tuple(e_begin, k_begin), + a_block_buf, + a_thread_mtx_, + make_tuple(I0, I0), + a_thread_buf); + + static_for<0, HPerThread, HoPerThreadSubC>{}([&](auto h_begin) { + static_for<0, WPerThread, WoPerThreadSubC>{}([&](auto w_begin) { + threadwise_gemm.Run(a_thread_buf, + make_tuple(I0, I0), + b_thread_buf, + make_tuple(e_begin, I0, h_begin, w_begin), + c_thread_buf, + make_tuple(k_begin, I0, h_begin, w_begin)); + }); + }); + }); + }); + } + + template + __device__ void MoveASliceWindow(const BlockMatrixA&, + const ABlockSliceMoveStepIdx& a_block_slice_move_step_idx) + { + a_thread_copy_.MoveSrcSliceWindow(BlockMatrixA{}, a_block_slice_move_step_idx); + } + + private: + MatrixIndex c_thread_begin_mtx_idx_; + + AThreadCopy a_thread_copy_; +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp new file mode 100644 index 0000000000..98407ab7fc --- /dev/null +++ b/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp @@ -0,0 +1,528 @@ +#ifndef CK_BLOCKWISE_GEMM_XDLOPS_HPP +#define CK_BLOCKWISE_GEMM_XDLOPS_HPP + +#include "common_header.hpp" +#include "threadwise_dynamic_tensor_slice_transfer.hpp" +#include "xdlops_gemm.hpp" + +namespace ck { + +template +struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1 +{ + + using CIndex = MultiIndex<2>; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr index_t WaveSize = 64; + + static constexpr index_t M0 = ABlockDesc{}.GetLength(I1); + static constexpr index_t M1 = ABlockDesc{}.GetLength(I2); + + static constexpr index_t N0 = BBlockDesc{}.GetLength(I1); + static constexpr index_t N1 = BBlockDesc{}.GetLength(I2); + + static constexpr auto xdlops_gemm = XdlopsGemm{}; + + static constexpr index_t MWaves = M1 / MPerWave; + static constexpr index_t NWaves = N1 / NPerWave; + + static constexpr index_t MRepeat = M0; + static constexpr index_t NRepeat = N0; + + __device__ constexpr auto GetCLayout() const { return xdlops_gemm.GetCLayout(); } + + __device__ constexpr auto GetNumBlks() const { return xdlops_gemm.GetCLayout().GetNumBlks(); } + + __device__ constexpr auto GetBlkSize() const { return xdlops_gemm.GetCLayout().GetBlkSize(); } + + __device__ static auto CalculateAThreadOriginDataIndex() + { + const index_t thread_id = get_thread_local_1d_id(); + const index_t waveId = thread_id / WaveSize; + const index_t laneId = thread_id % WaveSize; + const index_t waveId_m = waveId / NWaves; + const index_t waveId_n = waveId % NWaves; + + if 
constexpr(xdlops_gemm.IsKReduction) + { + const index_t m_offset = waveId_m * MPerWave + xdlops_gemm.GetBlkTd(laneId); + const index_t k_offset = xdlops_gemm.GetBlkId(laneId); + return make_tuple(k_offset, 0, m_offset, 0); + } + else + { + const index_t m_offset = waveId_m * MPerWave + laneId; + const index_t k_offset = 0; + return make_tuple(k_offset, 0, m_offset, 0); + } + } + + __device__ static auto CalculateBThreadOriginDataIndex() + { + const index_t thread_id = get_thread_local_1d_id(); + const index_t waveId = thread_id / WaveSize; + const index_t laneId = thread_id % WaveSize; + const index_t waveId_m = waveId / NWaves; + const index_t waveId_n = waveId % NWaves; + + if constexpr(xdlops_gemm.IsKReduction) + { + const index_t n_offset = waveId_n * NPerWave + xdlops_gemm.GetBlkTd(laneId); + const index_t k_offset = xdlops_gemm.GetBlkId(laneId); + return make_tuple(k_offset, 0, n_offset, 0); + } + else + { + const index_t n_offset = waveId_n * NPerWave + laneId; + const index_t k_offset = 0; + return make_tuple(k_offset, 0, n_offset, 0); + } + } + + template + __device__ static CIndex + CalculateCThreadOriginDataIndex(Number, Number, Number, Number) + { + + const index_t waveId = get_thread_local_1d_id() / WaveSize; + + const auto thread_mtx_on_blk = xdlops_gemm.GetBeginOfThreadBlk(xdlops_i, blk_i); + + const index_t waveId_m = waveId / NWaves; + const index_t waveId_n = waveId % NWaves; + + const index_t m_offset = m0 * M1 + waveId_m * MPerWave + thread_mtx_on_blk[I0]; + const index_t n_offset = n0 * N1 + waveId_n * NPerWave + thread_mtx_on_blk[I1]; + + return CIndex{m_offset, n_offset}; + } + + __device__ BlockwiseGemmXdlops_km_kn_m0m1m2n_v1() + : a_thread_copy_{CalculateAThreadOriginDataIndex()}, + b_thread_copy_{CalculateBThreadOriginDataIndex()} + { + static_assert(ABlockDesc::IsKnownAtCompileTime() && BBlockDesc::IsKnownAtCompileTime(), + "wrong! Desc should be known at compile-time"); + + static_assert(ABlockDesc{}.GetLength(I0) == BBlockDesc{}.GetLength(I0), + "wrong! K dimension not consistent"); + + static_assert(ABlockDesc{}.GetLength(I3) == BBlockDesc{}.GetLength(I3), + "wrong! 
K1 dimension not consistent"); + + static_assert(BlockSize == MWaves * NWaves * WaveSize, + "BlockSize != MWaves * NWaves * WaveSize\n"); + + static_assert(K1 == BBlockDesc{}.GetLength(I3), "K1 is wrong!"); + + constexpr index_t KPerBlock = ABlockDesc{}.GetLength(I0); + + static_assert(KPerBlock % xdlops_gemm.KPerXdlops == 0, "KPerBlock is wrong!"); + + static_assert(K1 % xdlops_gemm.mfma_type.k_base == 0, "K1 is wrong!"); + } + + template + __device__ void Run(const ABlockBuffer& a_block_buf, + const BBlockBuffer& b_block_buf, + CThreadBuffer& c_thread_buf) const + { + auto a_thread_buf = make_static_buffer( + a_thread_desc_.GetElementSpaceSize()); + auto b_thread_buf = make_static_buffer( + b_thread_desc_.GetElementSpaceSize()); + + constexpr index_t KPerBlock = ABlockDesc{}.GetLength(I0); + + vector_type a_thread_vec; + + vector_type b_thread_vec; + + static_for<0, KPerBlock, xdlops_gemm.KPerXdlops>{}([&](auto k) { + // read A + a_thread_copy_.Run(ABlockDesc{}, + make_tuple(k, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, I0, I0, I0), + a_thread_buf); + + // read B + b_thread_copy_.Run(BBlockDesc{}, + make_tuple(k, I0, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, I0, I0, I0), + b_thread_buf); + + using mfma_input_type = + typename vector_type::type; + + static_for<0, a_thread_desc_.GetElementSpaceSize(), 1>{}([&](auto i) { + a_thread_vec.template AsType()(Number{}) = a_thread_buf[Number{}]; + }); + + static_for<0, b_thread_desc_.GetElementSpaceSize(), 1>{}([&](auto i) { + b_thread_vec.template AsType()(Number{}) = b_thread_buf[Number{}]; + }); + + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + xdlops_gemm.template Run(a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf); + }); + }); + }); + } + + private: + // A[K, M] + static constexpr auto a_thread_desc_ = make_dynamic_naive_tensor_descriptor_packed_v2( + make_tuple(I1, Number{}, I1, Number{})); + + // B[K, N] + static constexpr auto b_thread_desc_ = make_dynamic_naive_tensor_descriptor_packed_v2( + make_tuple(I1, Number{}, I1, Number{})); + + static constexpr auto c_thread_desc_ = make_dynamic_naive_tensor_descriptor_packed_v2( + make_tuple(Number{}, Number{})); + + using AThreadCopy = ThreadwiseDynamicTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3>, + 3, + K1, + 1>; + + using BThreadCopy = ThreadwiseDynamicTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3>, + 3, + K1, + 1>; + + AThreadCopy a_thread_copy_; + BThreadCopy b_thread_copy_; +}; + +template +struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1_2x2pipeline +{ + + using CIndex = MultiIndex<2>; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr auto xdlops_gemm = XdlopsGemm{}; + + static constexpr index_t WaveSize = 64; + + static constexpr index_t M0 = ABlockDesc{}.GetLength(I1); + static constexpr index_t M1 = ABlockDesc{}.GetLength(I2); + + static constexpr index_t N0 = BBlockDesc{}.GetLength(I1); + static constexpr index_t N1 = BBlockDesc{}.GetLength(I2); + + static constexpr index_t MWaves = M1 / MPerWave; + static constexpr index_t NWaves = N1 / NPerWave; + + static constexpr index_t MRepeat = M0; + static constexpr index_t NRepeat = N0; + + __device__ constexpr auto GetCLayout() const { return xdlops_gemm.GetCLayout(); } + + __device__ constexpr auto GetNumBlks() const { return xdlops_gemm.GetCLayout().GetNumBlks(); } + + __device__ constexpr 
auto GetBlkSize() const { return xdlops_gemm.GetCLayout().GetBlkSize(); } + + __device__ static auto CalculateAThreadOriginDataIndex() + { + const index_t thread_id = get_thread_local_1d_id(); + const index_t waveId = thread_id / WaveSize; + const index_t laneId = thread_id % WaveSize; + const index_t waveId_m = waveId / NWaves; + const index_t waveId_n = waveId % NWaves; + + if constexpr(xdlops_gemm.IsKReduction) + { + const index_t m_offset = waveId_m * MPerWave + xdlops_gemm.GetBlkTd(laneId); + const index_t k_offset = xdlops_gemm.GetBlkId(laneId); + return make_tuple(k_offset, 0, m_offset, 0); + } + else + { + const index_t m_offset = waveId_m * MPerWave + laneId; + const index_t k_offset = 0; + return make_tuple(k_offset, 0, m_offset, 0); + } + } + + __device__ static auto CalculateBThreadOriginDataIndex() + { + const index_t thread_id = get_thread_local_1d_id(); + const index_t waveId = thread_id / WaveSize; + const index_t laneId = thread_id % WaveSize; + const index_t waveId_m = waveId / NWaves; + const index_t waveId_n = waveId % NWaves; + + if constexpr(xdlops_gemm.IsKReduction) + { + const index_t n_offset = waveId_n * NPerWave + xdlops_gemm.GetBlkTd(laneId); + const index_t k_offset = xdlops_gemm.GetBlkId(laneId); + return make_tuple(k_offset, 0, n_offset, 0); + } + else + { + const index_t n_offset = waveId_n * NPerWave + laneId; + const index_t k_offset = 0; + return make_tuple(k_offset, 0, n_offset, 0); + } + } + + template + __device__ static CIndex + CalculateCThreadOriginDataIndex(Number, Number, Number, Number) + { + + const index_t waveId = get_thread_local_1d_id() / WaveSize; + + const auto thread_mtx_on_blk = xdlops_gemm.GetBeginOfThreadBlk(xdlops_i, blk_i); + + const index_t waveId_m = waveId / NWaves; + const index_t waveId_n = waveId % NWaves; + + const index_t m_offset = m0 * M1 + waveId_m * MPerWave + thread_mtx_on_blk[I0]; + const index_t n_offset = n0 * N1 + waveId_n * NPerWave + thread_mtx_on_blk[I1]; + + return CIndex{m_offset, n_offset}; + } + + __device__ BlockwiseGemmXdlops_km_kn_m0m1m2n_v1_2x2pipeline() + : a_thread_copy_{CalculateAThreadOriginDataIndex()}, + b_thread_copy_{CalculateBThreadOriginDataIndex()} + { + static_assert(ABlockDesc::IsKnownAtCompileTime() && BBlockDesc::IsKnownAtCompileTime(), + "wrong! Desc should be known at compile-time"); + + static_assert(ABlockDesc{}.GetLength(I0) == BBlockDesc{}.GetLength(I0), + "wrong! K dimension not consistent"); + + static_assert(ABlockDesc{}.GetLength(I3) == BBlockDesc{}.GetLength(I3), + "wrong! 
K1 dimension not consistent"); + + static_assert(BlockSize == MWaves * NWaves * WaveSize, + "BlockSize != MWaves * NWaves * WaveSize\n"); + + static_assert(K1 == BBlockDesc{}.GetLength(I3), "K1 is wrong!"); + + constexpr index_t KPerBlock = ABlockDesc{}.GetLength(I0); + + static_assert(KPerBlock % xdlops_gemm.KPerXdlops == 0, "KPerBlock is wrong!"); + + static_assert(K1 % xdlops_gemm.mfma_type.k_base == 0, "K1 is wrong!"); + } + + template + __device__ void Run(const ABlockBuffer& a_block_buf, + const BBlockBuffer& b_block_buf, + CThreadBuffer& c_thread_buf) const + { + auto a_thread_buf = make_static_buffer( + a_thread_desc_.GetElementSpaceSize()); + auto b_thread_buf = make_static_buffer( + b_thread_desc_.GetElementSpaceSize()); + + constexpr index_t KPerBlock = ABlockDesc{}.GetLength(I0); + + // read A_sub_0 + a_thread_copy_.Run(ABlockDesc{}, + make_tuple(I0, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, I0, I0, I0), + a_thread_buf); + + // read B_sub_0 + b_thread_copy_.Run(BBlockDesc{}, + make_tuple(I0, I0, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, I0, I0, I0), + b_thread_buf); + + // read B_sub_1 + b_thread_copy_.Run(BBlockDesc{}, + make_tuple(I0, I1, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, I1, I0, I0), + b_thread_buf); + + // read A_sub_1 + a_thread_copy_.Run(ABlockDesc{}, + make_tuple(I0, I1, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, I1, I0, I0), + a_thread_buf); + + // C_sub_00 += transpose(A_sub_0) * B_sub_0 + xdlops_gemm.template Run(a_thread_buf, b_thread_buf, c_thread_buf); + + // C_sub_01 += transpose(A_sub_0) * B_sub_1 + xdlops_gemm.template Run(a_thread_buf, b_thread_buf, c_thread_buf); + + static_for{}([&](auto k) { + // read A_sub_0 + a_thread_copy_.Run(ABlockDesc{}, + make_tuple(k, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, I0, I0, I0), + a_thread_buf); + + // C_sub_10 += transpose(A_sub_1) * B_sub_0 + xdlops_gemm.template Run(a_thread_buf, b_thread_buf, c_thread_buf); + + // read B_sub_0 + b_thread_copy_.Run(BBlockDesc{}, + make_tuple(k, I0, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, I0, I0, I0), + b_thread_buf); + + // C_sub_11 += transpose(A_sub_1) * B_sub_1 + xdlops_gemm.template Run(a_thread_buf, b_thread_buf, c_thread_buf); + + // read B_sub_1 + b_thread_copy_.Run(BBlockDesc{}, + make_tuple(k, I1, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, I1, I0, I0), + b_thread_buf); + + // read A_sub_1 + a_thread_copy_.Run(ABlockDesc{}, + make_tuple(k, I1, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, I1, I0, I0), + a_thread_buf); + + // C_sub_00 += transpose(A_sub_0) * B_sub_0 + xdlops_gemm.template Run(a_thread_buf, b_thread_buf, c_thread_buf); + + // C_sub_01 += transpose(A_sub_0) * B_sub_1 + xdlops_gemm.template Run(a_thread_buf, b_thread_buf, c_thread_buf); + }); + + // C_sub_10 += transpose(A_sub_1) * B_sub_0 + xdlops_gemm.template Run(a_thread_buf, b_thread_buf, c_thread_buf); + + // C_sub_11 += transpose(A_sub_1) * B_sub_1 + xdlops_gemm.template Run(a_thread_buf, b_thread_buf, c_thread_buf); + } + + private: + // A[K, M] + static constexpr auto a_thread_desc_ = make_dynamic_naive_tensor_descriptor_packed_v2( + make_tuple(I1, Number{}, I1, Number{})); + + // B[K, N] + static constexpr auto b_thread_desc_ = make_dynamic_naive_tensor_descriptor_packed_v2( + make_tuple(I1, Number{}, I1, Number{})); + + static constexpr auto c_thread_desc_ = make_dynamic_naive_tensor_descriptor_packed_v2( + make_tuple(Number{}, Number{})); + + using AThreadCopy 
= ThreadwiseDynamicTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3>, + 3, + 1, // K1, + 1>; + + using BThreadCopy = ThreadwiseDynamicTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3>, + 3, + 1, // K1, + 1>; + + AThreadCopy a_thread_copy_; + BThreadCopy b_thread_copy_; +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/gridwise_dynamic_contraction_dlops_v1r2.hpp b/composable_kernel/include/tensor_operation/gridwise_dynamic_contraction_dlops_v1r2.hpp new file mode 100644 index 0000000000..6d48a18169 --- /dev/null +++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_contraction_dlops_v1r2.hpp @@ -0,0 +1,664 @@ +#ifndef CK_GRIDWISE_DYNAMIC_CONTRACTION_DLOPS_V1R2_HPP +#define CK_GRIDWISE_DYNAMIC_CONTRACTION_DLOPS_V1R2_HPP + +#include "common_header.hpp" +#include "dynamic_multi_index_transform_helper.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "blockwise_gemm_dlops_v2r3.hpp" +#include "blockwise_dynamic_tensor_slice_transfer_v2.hpp" +#include "threadwise_dynamic_tensor_slice_transfer.hpp" +#include "threadwise_dynamic_tensor_slice_set.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_dynamic_contraction_dlops_v1r2( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const AGridDesc_GK0_GM0_GM10_GM11_GK1 a_grid_desc_gk0_gm0_gm10_gm11_gk1, + const BGridDesc_GK0_GN0_GN10_GN11_GK1 b_grid_desc_gk0_gn0_gn10_gn11_gk1, + const CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1 c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1, + const CGridBlockCluster_BlockId_To_GM10_GN10 c_grid_block_cluster_blockid_to_gm10_gn10) +{ + constexpr index_t shared_block_size = + GridwiseContraction::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseContraction::Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared_block, + a_grid_desc_gk0_gm0_gm10_gm11_gk1, + b_grid_desc_gk0_gn0_gn10_gn11_gk1, + c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1, + c_grid_block_cluster_blockid_to_gm10_gn10, + integral_constant{}, + integral_constant{}); +} + +template +struct GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + // GM0 and GN0 need to known at compile-time + static constexpr auto GM0 = CGridDesc_GM0_GM1_GN0_GN1{}.GetLength(I0); + static constexpr auto GN0 = CGridDesc_GM0_GM1_GN0_GN1{}.GetLength(I2); + static constexpr auto GK1 = AGridDesc_GK0_GM0_GM1_GK1{}.GetLength(I3); + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + // lds max alignment + // TODO: part of them should be moved into blockwise-gemm + // TODO: change this. 
I think it needs multi-dimensional alignment + constexpr auto max_lds_align = GK1; + + // A matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = + make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, GM0, I1, Number{}, GK1), + max_lds_align); + + // B matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = + make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, GN0, I1, Number{}, GK1), + max_lds_align); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_aligned_space_size = math::integer_least_multiple( + a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_aligned_space_size = math::integer_least_multiple( + b_block_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize(), max_lds_align); + + return 2 * (a_block_aligned_space_size + b_block_aligned_space_size) * sizeof(FloatAB); + } + + __host__ __device__ static constexpr bool + CheckValidity(const AGridDesc_GK0_GM0_GM1_GK1& a_grid_desc_gk0_gm0_gm1_gk1, + const BGridDesc_GK0_GN0_GN1_GK1& b_grid_desc_gk0_gn0_gn1_gk1, + const CGridDesc_GM0_GM1_GN0_GN1& c_grid_desc_gm0_gm1_gn0_gn1) + { + static_assert(is_known_at_compile_time>::value && + is_known_at_compile_time>::value, + "wrong! GM0 and GN0 need to be known at compile-time"); + + const auto GM1 = a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I2); + const auto GN1 = b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I2); + const auto GK0 = a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I0); + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + + return ( + (GM0 == c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I0) && + GM1 == c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I1) && + GN0 == c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I2) && + GN1 == c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I3) && + GM0 == a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I1) && + GM1 == a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I2) && + GN0 == b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I1) && + GN1 == b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I2) && + GK0 == b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I0) && + GK1 == b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I3)) && + (GM1 % GM1PerBlockGM11 == 0 && GN1 % GN1PerBlockGN11 == 0 && GK0 % GK0PerBlock == 0)); + } + + __host__ __device__ static constexpr index_t + CalculateGridSize(const CGridDesc_GM0_GM1_GN0_GN1& c_grid_desc_gm0_gm1_gn0_gn1) + { + const auto GM1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I1); + const auto GN1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I3); + + constexpr index_t GM11 = GM1PerBlockGM11; + constexpr index_t GN11 = GN1PerBlockGN11; + + const index_t GM10 = GM1 / GM11; + const index_t GN10 = GN1 / GN11; + + const index_t grid_size = GM10 * GN10; + + return grid_size; + } + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t GK0) + { + const bool has_main_k_block_loop = (GK0 + GK0PerBlock) / (2 * GK0PerBlock) > 1; + + return has_main_k_block_loop; + } + + __host__ __device__ static constexpr bool CalculateHasDoubleTailKBlockLoop(index_t GK0) + { + const bool has_double_tail_k_block_loop = (GK0 / GK0PerBlock) % 2 == 0; + + return has_double_tail_k_block_loop; + } + + __host__ __device__ static constexpr auto MakeAGridDescriptor_GK0_GM0_GM10_GM11_GK1( + const AGridDesc_GK0_GM0_GM1_GK1& a_grid_desc_gk0_gm0_gm1_gk1) + { + const auto GK0 = a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I0); + const auto GM1 = 
a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I2); + + const auto GM11 = Number{}; + const auto GM10 = GM1 / GM11; + + const auto a_grid_desc_gk0_gm0_gm10_gm11_gk1 = transform_dynamic_tensor_descriptor( + a_grid_desc_gk0_gm0_gm1_gk1, + make_tuple(make_pass_through_transform(GK0), + make_pass_through_transform(GM0), + make_unmerge_transform(make_tuple(GM10, GM11)), + make_pass_through_transform(GK1)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{})); + + return a_grid_desc_gk0_gm0_gm10_gm11_gk1; + } + + __host__ __device__ static constexpr auto MakeBGridDescriptor_GK0_GN0_GN10_GN11_GK1( + const BGridDesc_GK0_GN0_GN1_GK1& b_grid_desc_gk0_gn0_gn1_gk1) + { + const auto GK0 = b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I0); + const auto GN1 = b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I2); + + const auto GN11 = Number{}; + const auto GN10 = GN1 / GN11; + + const auto b_grid_desc_gk0_gn0_gn10_gn11_gk1 = transform_dynamic_tensor_descriptor( + b_grid_desc_gk0_gn0_gn1_gk1, + make_tuple(make_pass_through_transform(GK0), + make_pass_through_transform(GN0), + make_unmerge_transform(make_tuple(GN10, GN11)), + make_pass_through_transform(GK1)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{})); + + return b_grid_desc_gk0_gn0_gn10_gn11_gk1; + } + + __host__ __device__ static constexpr auto MakeCGridDescriptor_GM10_BM0_BM1_GN10_BN0_BN1( + const CGridDesc_GM0_GM1_GN0_GN1& c_grid_desc_gm0_gm1_gn0_gn1) + { + const auto GM1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I1); + const auto GN1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I3); + + constexpr auto GM11 = Number{}; + constexpr auto GN11 = Number{}; + + const auto GM10 = GM1 / GM11; + const auto GN10 = GN1 / GN11; + + constexpr auto BM = GM0 * GM11; + constexpr auto BN = GN0 * GN11; + + constexpr auto BM1 = + Number{}; + constexpr auto BN1 = + Number{}; + + constexpr auto BM0 = BM / BM1; + constexpr auto BN0 = BN / BN1; + + const auto c_gm0_gm10_gm11_gn0_gn10_gn11_grid_desc = transform_dynamic_tensor_descriptor( + c_grid_desc_gm0_gm1_gn0_gn1, + make_tuple(make_pass_through_transform(GM0), + make_unmerge_transform(make_tuple(GM10, GM11)), + make_pass_through_transform(GN0), + make_unmerge_transform(make_tuple(GN10, GN11))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}, Sequence<4, 5>{})); + + const auto c_gm10_bm_gn10_bn_grid_desc = transform_dynamic_tensor_descriptor( + c_gm0_gm10_gm11_gn0_gn10_gn11_grid_desc, + make_tuple(make_pass_through_transform(GM10), + make_merge_transform(make_tuple(GM0, GM11)), + make_pass_through_transform(GN10), + make_merge_transform(make_tuple(GN0, GN11))), + make_tuple(Sequence<1>{}, Sequence<0, 2>{}, Sequence<4>{}, Sequence<3, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1 = transform_dynamic_tensor_descriptor( + c_gm10_bm_gn10_bn_grid_desc, + make_tuple(make_pass_through_transform(GM10), + make_unmerge_transform(make_tuple(BM0, BM1)), + make_pass_through_transform(GN10), + make_unmerge_transform(make_tuple(BN0, BN1))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}, Sequence<4, 5>{})); + + return c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1; + } + + __host__ __device__ static constexpr auto 
MakeCGridBlockCluster_BlockId_To_GM10_GN10( + const CGridDesc_GM0_GM1_GN0_GN1& c_grid_desc_gm0_gm1_gn0_gn1) + { + const auto GM1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I1); + const auto GN1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I3); + + constexpr auto GM11 = Number{}; + constexpr auto GN11 = Number{}; + + const auto GM10 = GM1 / GM11; + const auto GN10 = GN1 / GN11; + + const auto c_grid_block_cluster_blockid_to_gm10_gn10 = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(GM10, GN10))), + make_tuple(Sequence<0, 1>{}), + make_tuple(Sequence<0>{})); + + return c_grid_block_cluster_blockid_to_gm10_gn10; + } + + using AGridDesc_GK0_GM0_GM10_GM11_GK1 = + decltype(MakeAGridDescriptor_GK0_GM0_GM10_GM11_GK1(AGridDesc_GK0_GM0_GM1_GK1{})); + using BGridDesc_GK0_GN0_GN10_GN11_GK1 = + decltype(MakeBGridDescriptor_GK0_GN0_GN10_GN11_GK1(BGridDesc_GK0_GN0_GN1_GK1{})); + using CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1 = + decltype(MakeCGridDescriptor_GM10_BM0_BM1_GN10_BN0_BN1(CGridDesc_GM0_GM1_GN0_GN1{})); + using CGridBlockCluster_BlockId_To_GM10_GN10 = + decltype(MakeCGridBlockCluster_BlockId_To_GM10_GN10(CGridDesc_GM0_GM1_GN0_GN1{})); + + template + __device__ static void + Run(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + FloatAB* __restrict__ p_shared_block, + const AGridDesc_GK0_GM0_GM10_GM11_GK1& a_grid_desc_gk0_gm0_gm10_gm11_gk1, + const BGridDesc_GK0_GN0_GN10_GN11_GK1& b_grid_desc_gk0_gn0_gn10_gn11_gk1, + const CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1& c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1, + const CGridBlockCluster_BlockId_To_GM10_GN10& c_grid_block_cluster_blockid_to_gm10_gn10, + integral_constant, + integral_constant) + { + const auto a_global_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize()); + const auto b_global_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1.GetElementSpaceSize()); + + const auto GK0 = a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetLength(I0); + + // divide block work by [GM10, GN10] + const auto c_gm10_gn10_block_cluster_idx = + c_grid_block_cluster_blockid_to_gm10_gn10.CalculateBottomIndex( + make_multi_index(get_block_1d_id())); + + // HACK: this force index data into SGPR + const index_t igm10 = __builtin_amdgcn_readfirstlane(c_gm10_gn10_block_cluster_idx[I0]); + const index_t ign10 = __builtin_amdgcn_readfirstlane(c_gm10_gn10_block_cluster_idx[I1]); + + // lds max alignment + // TODO: part of them should be moved into blockwise-gemm + // TODO: change this. 
I think it needs multi-dimensional alignment + constexpr auto max_lds_align = GK1; + + // A matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = + make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, GM0, I1, Number{}, GK1), + max_lds_align); + + // B matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = + make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, GN0, I1, Number{}, GK1), + max_lds_align); + + // A matrix in LDS memory for blockwise GEMM + // be careful of LDS alignment + constexpr auto a_block_desc_gk0_bm_gk1 = make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, GM0 * Number{}, GK1), max_lds_align); + + // B matrix in LDS memory for blockwise GEMM + // be careful of LDS alignment + constexpr auto b_block_desc_gk0_bn_gk1 = make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, GN0 * Number{}, GK1), max_lds_align); + + static_assert(a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize() == + a_block_desc_gk0_bm_gk1.GetElementSpaceSize() && + b_block_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize() == + b_block_desc_gk0_bn_gk1.GetElementSpaceSize(), + "wrong!"); + + // A matrix blockwise copy + auto a_blockwise_copy = BlockwiseDynamicTensorSliceTransfer_v4r1< + BlockSize, + InMemoryDataOperationEnum_t::Set, + Sequence, + ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1, + ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_grid_desc_gk0_gm0_gm10_gm11_gk1), + decltype(a_block_desc_gk0_gm0_gm10_gm11_gk1), + ABlockTransferSrcAccessOrder, + Sequence<0, 1, 2, 3, 4>, + ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1, // SrcVectorTensorLengths + ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1, // DstVectorTensorLengths + ABlockTransferSrcVectorTensorContiguousDimOrder, // SrcVectorTensorContiguousDimOrder + Sequence<0, 1, 2, 3, 4>, // DstVectorTensorContiguousDimOrder + false, + true>(a_grid_desc_gk0_gm0_gm10_gm11_gk1, + make_multi_index(0, 0, igm10, 0, 0), + a_block_desc_gk0_gm0_gm10_gm11_gk1, + make_multi_index(0, 0, 0, 0, 0)); + + // B matrix blockwise copy + auto b_blockwise_copy = BlockwiseDynamicTensorSliceTransfer_v4r1< + BlockSize, + InMemoryDataOperationEnum_t::Set, + Sequence, + BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1, + BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_grid_desc_gk0_gn0_gn10_gn11_gk1), + decltype(b_block_desc_gk0_gn0_gn10_gn11_gk1), + BBlockTransferSrcAccessOrder, + Sequence<0, 1, 2, 3, 4>, + BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1, // SrcVectorTensorLengths + BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1, // DstVectorTensorLengths + BBlockTransferSrcVectorTensorContiguousDimOrder, // SrcVectorTensorContiguousDimOrder + Sequence<0, 1, 2, 3, 4>, // DstVectorTensorContiguousDimOrder + false, + true>(b_grid_desc_gk0_gn0_gn10_gn11_gk1, + make_multi_index(0, 0, ign10, 0, 0), + b_block_desc_gk0_gn0_gn10_gn11_gk1, + make_multi_index(0, 0, 0, 0, 0)); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[GK0PerBlock, GM1PerBlockGM11] is in LDS + // b_mtx[KPerBlocl, GN1PerBlockGN11] is in LDS + // c_mtx[GM1PerBlockGM11, GN1PerBlockGN11] is distributed among threads, and saved in + // 
register + const auto blockwise_gemm = + BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< + BlockSize, + FloatAB, + FloatAB, + FloatAcc, + decltype(a_block_desc_gk0_bm_gk1), + decltype(b_block_desc_gk0_bn_gk1), + BM1PerThreadBM11, + BN1PerThreadBN11, + BK0PerThread, + BM10BN10ThreadClusterBM10Xs, + BM10BN10ThreadClusterBN10Xs, + BM1PerThreadBM11, + BN1PerThreadBN11>{}; + + constexpr auto c_thread_tensor_lengths_bm0_bm1_bn0_bn1 = + decltype(blockwise_gemm)::GetCThreadTensorLengths_BM0_BM1_BN0_BN1(); + + constexpr auto c_thread_desc_bm0_bm1_bn0_bn1 = + make_dynamic_naive_tensor_descriptor_packed_v2( + sequence_to_tuple_of_number(c_thread_tensor_lengths_bm0_bm1_bn0_bn1)); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_aligned_space_size = math::integer_least_multiple( + a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_aligned_space_size = math::integer_least_multiple( + b_block_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize(), max_lds_align); + + FloatAB* p_a_block_double = p_shared_block; + FloatAB* p_b_block_double = p_shared_block + 2 * a_block_aligned_space_size; + + // register allocation for output + auto c_thread_buf = make_static_buffer( + c_thread_desc_bm0_bm1_bn0_bn1.GetElementSpaceSize()); + + ThreadwiseDynamicTensorSliceSet_v1{} + .Run(c_thread_desc_bm0_bm1_bn0_bn1, + make_tuple(I0, I0, I0, I0), + c_thread_buf, + FloatAcc{0}); + + constexpr auto a_block_slice_copy_step = make_multi_index(GK0PerBlock, 0, 0, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(GK0PerBlock, 0, 0, 0, 0); + + auto a_block_even_buf = make_dynamic_buffer( + p_a_block_double, a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize()); + auto b_block_even_buf = make_dynamic_buffer( + p_b_block_double, b_block_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize()); + + auto a_block_odd_buf = make_dynamic_buffer( + p_a_block_double + a_block_aligned_space_size, + a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize()); + auto b_block_odd_buf = make_dynamic_buffer( + p_b_block_double + b_block_aligned_space_size, + b_block_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize()); + + // LDS double buffer: preload data into LDS + { + a_blockwise_copy.RunRead( + a_grid_desc_gk0_gm0_gm10_gm11_gk1, a_global_buf, AGridIteratorHacks{}); + b_blockwise_copy.RunRead( + b_grid_desc_gk0_gn0_gn10_gn11_gk1, b_global_buf, BGridIteratorHacks{}); + + a_blockwise_copy.RunWrite(a_block_desc_gk0_gm0_gm10_gm11_gk1, a_block_even_buf); + b_blockwise_copy.RunWrite(b_block_desc_gk0_gn0_gn10_gn11_gk1, b_block_even_buf); + } + + if constexpr(HasMainKBlockLoop) + { + index_t gk0_block_on_grid = 0; + + // LDS double buffer: main body + // use Do-While loop instead of For loop to simplify control flow + do + { + // even iteration + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_gk0_gm0_gm10_gm11_gk1, + a_block_slice_copy_step, + AGridMoveSliceWindowIteratorHacks{}); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_gk0_gn0_gn10_gn11_gk1, + b_block_slice_copy_step, + BGridMoveSliceWindowIteratorHacks{}); + + __syncthreads(); + + // LDS doubel buffer: load next data from device mem + a_blockwise_copy.RunRead( + a_grid_desc_gk0_gm0_gm10_gm11_gk1, a_global_buf, AGridIteratorHacks{}); + b_blockwise_copy.RunRead( + b_grid_desc_gk0_gn0_gn10_gn11_gk1, b_global_buf, BGridIteratorHacks{}); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run(c_thread_desc_bm0_bm1_bn0_bn1, + a_block_even_buf, + 
b_block_even_buf, + c_thread_buf); + + // LDS double buffer: store next data to LDS + a_blockwise_copy.RunWrite(a_block_desc_gk0_gm0_gm10_gm11_gk1, a_block_odd_buf); + b_blockwise_copy.RunWrite(b_block_desc_gk0_gn0_gn10_gn11_gk1, b_block_odd_buf); + + // odd iteration + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_gk0_gm0_gm10_gm11_gk1, + a_block_slice_copy_step, + AGridMoveSliceWindowIteratorHacks{}); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_gk0_gn0_gn10_gn11_gk1, + b_block_slice_copy_step, + BGridMoveSliceWindowIteratorHacks{}); + + __syncthreads(); + + // LDS doubel buffer: load next data from device mem + a_blockwise_copy.RunRead( + a_grid_desc_gk0_gm0_gm10_gm11_gk1, a_global_buf, AGridIteratorHacks{}); + b_blockwise_copy.RunRead( + b_grid_desc_gk0_gn0_gn10_gn11_gk1, b_global_buf, BGridIteratorHacks{}); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run( + c_thread_desc_bm0_bm1_bn0_bn1, a_block_odd_buf, b_block_odd_buf, c_thread_buf); + + // LDS double buffer: store next data to LDS + a_blockwise_copy.RunWrite(a_block_desc_gk0_gm0_gm10_gm11_gk1, a_block_even_buf); + b_blockwise_copy.RunWrite(b_block_desc_gk0_gn0_gn10_gn11_gk1, b_block_even_buf); + + gk0_block_on_grid += 2 * GK0PerBlock; + } while(gk0_block_on_grid < GK0 - 2 * GK0PerBlock); + } + + // LDS double buffer: tail + if constexpr(HasDoubleTailKBlockLoop) // if has 2 iteration left + { + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_gk0_gm0_gm10_gm11_gk1, + a_block_slice_copy_step, + AGridMoveSliceWindowIteratorHacks{}); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_gk0_gn0_gn10_gn11_gk1, + b_block_slice_copy_step, + BGridMoveSliceWindowIteratorHacks{}); + + __syncthreads(); + + // LDS double buffer: load last data from device mem + a_blockwise_copy.RunRead( + a_grid_desc_gk0_gm0_gm10_gm11_gk1, a_global_buf, AGridIteratorHacks{}); + b_blockwise_copy.RunRead( + b_grid_desc_gk0_gn0_gn10_gn11_gk1, b_global_buf, BGridIteratorHacks{}); + + // LDS double buffer: GEMM on 2nd-last data + blockwise_gemm.Run( + c_thread_desc_bm0_bm1_bn0_bn1, a_block_even_buf, b_block_even_buf, c_thread_buf); + + // LDS double buffer: store last data to LDS + a_blockwise_copy.RunWrite(a_block_desc_gk0_gm0_gm10_gm11_gk1, a_block_odd_buf); + b_blockwise_copy.RunWrite(b_block_desc_gk0_gn0_gn10_gn11_gk1, b_block_odd_buf); + + __syncthreads(); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run( + c_thread_desc_bm0_bm1_bn0_bn1, a_block_odd_buf, b_block_odd_buf, c_thread_buf); + } + else // if has 1 iteration left + { + __syncthreads(); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run( + c_thread_desc_bm0_bm1_bn0_bn1, a_block_even_buf, b_block_even_buf, c_thread_buf); + } + + // output: register to global memory + { + constexpr auto c_thread_desc_gm10_bm0_bm1_gn10_bn0_bn1 = + make_dynamic_naive_tensor_descriptor_packed_v2( + make_tuple(I1, + Number{}, + Number{}, + I1, + Number{}, + Number{})); + + const auto c_thread_origin_on_block_bm0_bm1_bn0_bn1 = + blockwise_gemm.CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1( + get_thread_local_1d_id()); + + ThreadwiseDynamicTensorSliceTransfer_v1r3< + FloatAcc, + FloatC, + decltype(c_thread_desc_gm10_bm0_bm1_gn10_bn0_bn1), + decltype(c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1), + Sequence<1, + c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I0], + c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I1], + 1, + c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I2], + c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I3]>, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + 
CThreadTransferDstScalarPerVector, + CGlobalMemoryDataOperation, + 1, + false>{c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1, + make_multi_index(igm10, + c_thread_origin_on_block_bm0_bm1_bn0_bn1[I0], + c_thread_origin_on_block_bm0_bm1_bn0_bn1[I1], + ign10, + c_thread_origin_on_block_bm0_bm1_bn0_bn1[I2], + c_thread_origin_on_block_bm0_bm1_bn0_bn1[I3])} + .Run(c_thread_desc_gm10_bm0_bm1_gn10_bn0_bn1, + make_tuple(I0, I0, I0, I0, I0, I0), + c_thread_buf, + c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1, + c_grid_buf, + CGridIteratorHacks{}); + } + } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r2.hpp b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r2.hpp new file mode 100644 index 0000000000..7a4ef1d7ea --- /dev/null +++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r2.hpp @@ -0,0 +1,679 @@ +#ifndef CK_GRIDWISE_DYNAMIC_GEMM_DLOPS_V1R2_HPP +#define CK_GRIDWISE_DYNAMIC_GEMM_DLOPS_V1R2_HPP + +#include "common_header.hpp" +#include "dynamic_multi_index_transform_helper.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "blockwise_gemm_dlops_v2r2.hpp" +#include "blockwise_dynamic_tensor_slice_transfer.hpp" +#include "threadwise_dynamic_tensor_slice_transfer.hpp" +#include "threadwise_dynamic_tensor_slice_set.hpp" + +namespace ck { + +#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_dynamic_gemm_dlops_v1r2( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const AKM0M1GridDesc a_k_m0_m1_grid_desc, + const BKN0N1GridDesc b_k_n0_n1_grid_desc, + const CM0M10M11N0N10N11GridDesc c_m0_m10_m11_n0_n10_n11_grid_desc, + const CBlockIdToM0N0BlockClusterAdaptor c_blockid_to_m0_n0_block_cluster_adaptor) +{ + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared_block, + a_k_m0_m1_grid_desc, + b_k_n0_n1_grid_desc, + c_m0_m10_m11_n0_n10_n11_grid_desc, + c_blockid_to_m0_n0_block_cluster_adaptor, + integral_constant{}, + integral_constant{}); +} +#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER +// pass tensor descriptor by CONSTANT void pointer +// CONSTANT is needed to inform compiler void pointers in the kernel signature are pointing to +// non-modifiable parameter address space, so compiler can enable corresponding optimization +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_dynamic_gemm_dlops_v1r2( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const void CONSTANT* p_a_k_m0_m1_grid_desc, + const void CONSTANT* p_b_k_n0_n1_grid_desc, + const void CONSTANT* p_c_m0_m10_m11_n0_n10_n11_grid_desc, + const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor) +{ + // first cast void CONSTANT void* to void* + // second cast void* to Desc* + // the copy constructor of tensor descriptor doesn't take address_space(4) + const auto a_k_m0_m1_grid_desc = + *reinterpret_cast((const void*)p_a_k_m0_m1_grid_desc); + const auto b_k_n0_n1_grid_desc = + *reinterpret_cast((const void*)p_b_k_n0_n1_grid_desc); + const auto 
c_m0_m10_m11_n0_n10_n11_grid_desc = + *reinterpret_cast( + (const void*)p_c_m0_m10_m11_n0_n10_n11_grid_desc); + const auto c_blockid_to_m0_n0_block_cluster_adaptor = + *reinterpret_cast( + (const void*)p_c_blockid_to_m0_n0_block_cluster_adaptor); + + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared_block, + a_k_m0_m1_grid_desc, + b_k_n0_n1_grid_desc, + c_m0_m10_m11_n0_n10_n11_grid_desc, + c_blockid_to_m0_n0_block_cluster_adaptor, + integral_constant{}, + integral_constant{}); +} +#endif + +template +struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + constexpr auto max_lds_align = math::lcm(Number{}, + Number{}, + Number{}, + Number{}); + + // A matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto a_k_m_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, Number{}), max_lds_align); + + // B matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto b_k_n_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, Number{}), max_lds_align); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_aligned_space_size = + math::integer_least_multiple(a_k_m_block_desc.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_aligned_space_size = + math::integer_least_multiple(b_k_n_block_desc.GetElementSpaceSize(), max_lds_align); + + return 2 * (a_block_aligned_space_size + b_block_aligned_space_size) * sizeof(FloatAB); + } + + __host__ __device__ static constexpr bool CheckValidity(const AKMGridDesc& a_k_m_grid_desc, + const BKNGridDesc& b_k_n_grid_desc, + const CMNGridDesc& c_m_n_grid_desc) + { + const auto M = a_k_m_grid_desc.GetLength(I1); + const auto N = b_k_n_grid_desc.GetLength(I1); + const auto K = a_k_m_grid_desc.GetLength(I0); + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + + return (M == c_m_n_grid_desc.GetLength(I0) && N == c_m_n_grid_desc.GetLength(I1) && + K == b_k_n_grid_desc.GetLength(I0)) && + (M % MPerBlockM1 == 0 && N % NPerBlockN1 == 0 && K % KPerBlock == 0); + } + + __host__ __device__ static constexpr index_t CalculateGridSize(index_t M, index_t N) + { + const index_t grid_size = (M / MPerBlockM1) * (N / NPerBlockN1); + + return grid_size; + } + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) + { + const bool has_main_k_block_loop = (K + KPerBlock) / (2 * KPerBlock) > 1; + + return has_main_k_block_loop; + } + + __host__ __device__ static constexpr bool CalculateHasDoubleTailKBlockLoop(index_t K) + { + const bool has_double_tail_k_block_loop = (K / KPerBlock) % 2 == 0; + + return has_double_tail_k_block_loop; + } + + __host__ __device__ static constexpr auto + MakeAKM0M1GridDescriptor(const AKMGridDesc& a_k_m_grid_desc) + { + const auto K = a_k_m_grid_desc.GetLength(I0); + const auto M = a_k_m_grid_desc.GetLength(I1); + + const auto M1 = Number{}; + const auto M0 = M / M1; + + const auto a_k_m0_m1_grid_desc = transform_dynamic_tensor_descriptor( + a_k_m_grid_desc, + make_tuple(make_pass_through_transform(K), 
make_unmerge_transform(make_tuple(M0, M1))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{})); + + return a_k_m0_m1_grid_desc; + } + + __host__ __device__ static constexpr auto + MakeBKN0N1GridDescriptor(const BKNGridDesc& b_k_n_grid_desc) + { + const auto K = b_k_n_grid_desc.GetLength(I0); + const auto N = b_k_n_grid_desc.GetLength(I1); + + const auto N1 = Number{}; + const auto N0 = N / N1; + + const auto b_k_n0_n1_grid_desc = transform_dynamic_tensor_descriptor( + b_k_n_grid_desc, + make_tuple(make_pass_through_transform(K), make_unmerge_transform(make_tuple(N0, N1))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{})); + + return b_k_n0_n1_grid_desc; + } + + __host__ __device__ static constexpr auto + MakeCM0M10M11N0N10N11GridDescriptor(const CMNGridDesc& c_m_n_grid_desc) + { + const auto M = c_m_n_grid_desc.GetLength(I0); + const auto N = c_m_n_grid_desc.GetLength(I1); + + constexpr auto M1 = Number{}; + constexpr auto N1 = Number{}; + + const auto M0 = M / M1; + const auto N0 = N / N1; + + constexpr auto M11 = + Number{}; + constexpr auto N11 = + Number{}; + + constexpr auto M10 = M1 / M11; + constexpr auto N10 = N1 / N11; + + const auto c_m0_m10_m11_n0_n10_n11_grid_desc = transform_dynamic_tensor_descriptor( + c_m_n_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(M0, M10, M11)), + make_unmerge_transform(make_tuple(N0, N10, N11))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); + + return c_m0_m10_m11_n0_n10_n11_grid_desc; + } + + __host__ __device__ static constexpr auto + MakeCBlockIdToM0N0BlockClusterAdaptor(const CMNGridDesc& c_m_n_grid_desc) + { + const auto M = c_m_n_grid_desc.GetLength(I0); + const auto N = c_m_n_grid_desc.GetLength(I1); + + constexpr auto M1 = Number{}; + constexpr auto N1 = Number{}; + + const auto M0 = M / M1; + const auto N0 = N / N1; + + const auto c_blockid_to_m0_n0_block_cluster_adaptor = + make_single_stage_tensor_adaptor(make_tuple(make_merge_transform(make_tuple(M0, N0))), + make_tuple(Sequence<0, 1>{}), + make_tuple(Sequence<0>{})); + + return c_blockid_to_m0_n0_block_cluster_adaptor; + } + + using AKM0M1GridDesc = decltype(MakeAKM0M1GridDescriptor(AKMGridDesc{})); + using BKN0N1GridDesc = decltype(MakeBKN0N1GridDescriptor(BKNGridDesc{})); + using CM0M10M11N0N10N11GridDesc = decltype(MakeCM0M10M11N0N10N11GridDescriptor(CMNGridDesc{})); + using CBlockIdToM0N0BlockClusterAdaptor = + decltype(MakeCBlockIdToM0N0BlockClusterAdaptor(CMNGridDesc{})); + + template + __device__ static void + Run(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + FloatAB* __restrict__ p_shared_block, + const AKM0M1GridDesc& a_k_m0_m1_grid_desc, + const BKN0N1GridDesc& b_k_n0_n1_grid_desc, + const CM0M10M11N0N10N11GridDesc& c_m0_m10_m11_n0_n10_n11_grid_desc, + const CBlockIdToM0N0BlockClusterAdaptor& c_blockid_to_m0_n0_block_cluster_adaptor, + integral_constant, + integral_constant) + { + const auto a_global_buf = make_dynamic_buffer( + p_a_grid, a_k_m0_m1_grid_desc.GetElementSpaceSize()); + const auto b_global_buf = make_dynamic_buffer( + p_b_grid, b_k_n0_n1_grid_desc.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_m0_m10_m11_n0_n10_n11_grid_desc.GetElementSpaceSize()); + + const auto K = a_k_m0_m1_grid_desc.GetLength(I0); + + // divide block work by [M, N] + const auto c_m0_n0_block_cluster_idx = + 
c_blockid_to_m0_n0_block_cluster_adaptor.CalculateBottomIndex( + make_multi_index(get_block_1d_id())); + + // HACK: this force index data into SGPR + const index_t im0 = __builtin_amdgcn_readfirstlane(c_m0_n0_block_cluster_idx[I0]); + const index_t in0 = __builtin_amdgcn_readfirstlane(c_m0_n0_block_cluster_idx[I1]); + + // lds max alignment + constexpr auto max_lds_align = math::lcm(Number{}, + Number{}, + Number{}, + Number{}); + + // A matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto a_k_m_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, Number{}), max_lds_align); + + // B matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto b_k_n_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, Number{}), max_lds_align); + + // A matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto a_k_m0_m1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, I1, Number{}), max_lds_align); + + // B matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto b_k_n0_n1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, I1, Number{}), max_lds_align); + + // A matrix blockwise copy + auto a_blockwise_copy = + BlockwiseDynamicTensorSliceTransfer_v4, + ABlockTransferThreadSliceLengths_K_M0_M1, + ABlockTransferThreadClusterLengths_K_M0_M1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_k_m0_m1_grid_desc), + decltype(a_k_m0_m1_block_desc), + ABlockTransferSrcAccessOrder, + Sequence<0, 1, 2>, + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_M1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true>(a_k_m0_m1_grid_desc, + make_multi_index(0, im0, 0), + a_k_m0_m1_block_desc, + make_multi_index(0, 0, 0)); + + // B matrix blockwise copy + auto b_blockwise_copy = + BlockwiseDynamicTensorSliceTransfer_v4, + BBlockTransferThreadSliceLengths_K_N0_N1, + BBlockTransferThreadClusterLengths_K_N0_N1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_k_n0_n1_grid_desc), + decltype(b_k_n0_n1_block_desc), + BBlockTransferSrcAccessOrder, + Sequence<0, 1, 2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_N1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true>(b_k_n0_n1_grid_desc, + make_multi_index(0, in0, 0), + b_k_n0_n1_block_desc, + make_multi_index(0, 0, 0)); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[KPerBlock, MPerBlockM1] is in LDS + // b_mtx[KPerBlocl, NPerBlockN1] is in LDS + // c_mtx[MPerBlockM1, NPerBlockN1] is distributed among threads, and saved in + // register + const auto blockwise_gemm = + BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2{}; + constexpr auto c_m10_m11_n10_n11_thread_tensor_lengths = + decltype(blockwise_gemm)::GetCM0M1N0N1ThreadTensorLengths(); + + constexpr auto c_m10_m11_n10_n11_thread_desc = + make_dynamic_naive_tensor_descriptor_packed_v2( + sequence_to_tuple_of_number(c_m10_m11_n10_n11_thread_tensor_lengths)); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_aligned_space_size = + math::integer_least_multiple(a_k_m0_m1_block_desc.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_aligned_space_size = + 
math::integer_least_multiple(b_k_n0_n1_block_desc.GetElementSpaceSize(), max_lds_align); + + FloatAB* p_a_block_double = p_shared_block; + FloatAB* p_b_block_double = p_shared_block + 2 * a_block_aligned_space_size; + + // register allocation for output + auto c_thread_buf = make_static_buffer( + c_m10_m11_n10_n11_thread_desc.GetElementSpaceSize()); + + ThreadwiseDynamicTensorSliceSet_v1{} + .Run(c_m10_m11_n10_n11_thread_desc, + make_tuple(I0, I0, I0, I0), + c_thread_buf, + FloatAcc{0}); + + constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0); + + // hack to control index calculation when iterating over A and B matrix for threadwise copy + constexpr auto a_k_m0_m1_global_iterator_hacks = AGridIteratorHacks{}; + constexpr auto b_k_n0_n1_global_iterator_hacks = BGridIteratorHacks{}; + + // hack to control index calculation when move slice window for A and B matrix for + // threadwise copy + constexpr auto a_k_m0_m1_global_move_slice_window_iterator_hack = + AGridMoveSliceWindowIteratorHacks{}; + constexpr auto b_k_n0_n1_global_move_slice_window_iterator_hack = + BGridMoveSliceWindowIteratorHacks{}; + + auto a_block_even_buf = make_dynamic_buffer( + p_a_block_double, a_k_m0_m1_block_desc.GetElementSpaceSize()); + auto b_block_even_buf = make_dynamic_buffer( + p_b_block_double, b_k_n0_n1_block_desc.GetElementSpaceSize()); + + auto a_block_odd_buf = make_dynamic_buffer( + p_a_block_double + a_block_aligned_space_size, + a_k_m0_m1_block_desc.GetElementSpaceSize()); + auto b_block_odd_buf = make_dynamic_buffer( + p_b_block_double + b_block_aligned_space_size, + b_k_n0_n1_block_desc.GetElementSpaceSize()); + + // LDS double buffer: preload data into LDS + { + a_blockwise_copy.RunRead( + a_k_m0_m1_grid_desc, a_global_buf, a_k_m0_m1_global_iterator_hacks); + b_blockwise_copy.RunRead( + b_k_n0_n1_grid_desc, b_global_buf, b_k_n0_n1_global_iterator_hacks); + + a_blockwise_copy.RunWrite(a_k_m0_m1_block_desc, a_block_even_buf); + b_blockwise_copy.RunWrite(b_k_n0_n1_block_desc, b_block_even_buf); + } + + if constexpr(HasMainKBlockLoop) + { + index_t k_block_data_begin = 0; + + // LDS double buffer: main body + // use Do-While loop instead of For loop to simplify control flow + do + { + // even iteration + a_blockwise_copy.MoveSrcSliceWindow( + a_k_m0_m1_grid_desc, + a_block_slice_copy_step, + a_k_m0_m1_global_move_slice_window_iterator_hack); + b_blockwise_copy.MoveSrcSliceWindow( + b_k_n0_n1_grid_desc, + b_block_slice_copy_step, + b_k_n0_n1_global_move_slice_window_iterator_hack); + + __syncthreads(); + + // LDS doubel buffer: load next data from device mem + a_blockwise_copy.RunRead( + a_k_m0_m1_grid_desc, a_global_buf, a_k_m0_m1_global_iterator_hacks); + b_blockwise_copy.RunRead( + b_k_n0_n1_grid_desc, b_global_buf, b_k_n0_n1_global_iterator_hacks); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run(c_m10_m11_n10_n11_thread_desc, + a_block_even_buf, + b_block_even_buf, + c_thread_buf); + + // LDS double buffer: store next data to LDS + a_blockwise_copy.RunWrite(a_k_m0_m1_block_desc, a_block_odd_buf); + b_blockwise_copy.RunWrite(b_k_n0_n1_block_desc, b_block_odd_buf); + + // odd iteration + a_blockwise_copy.MoveSrcSliceWindow( + a_k_m0_m1_grid_desc, + a_block_slice_copy_step, + a_k_m0_m1_global_move_slice_window_iterator_hack); + b_blockwise_copy.MoveSrcSliceWindow( + b_k_n0_n1_grid_desc, + b_block_slice_copy_step, + b_k_n0_n1_global_move_slice_window_iterator_hack); + + __syncthreads(); 
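+                // note (clarifying comment): this barrier is what makes the LDS double-buffer
+                // scheme safe. It guarantees every thread has finished the even-phase GEMM
+                // (which reads the even LDS buffers) and the even-phase RunWrite (which fills
+                // the odd LDS buffers) before the odd phase below starts reading the odd
+                // buffers and later overwrites the even ones. The global RunRead issued next
+                // can then be in flight while the odd-phase GEMM works on data already
+                // resident in LDS.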
+ + // LDS doubel buffer: load next data from device mem + a_blockwise_copy.RunRead( + a_k_m0_m1_grid_desc, a_global_buf, a_k_m0_m1_global_iterator_hacks); + b_blockwise_copy.RunRead( + b_k_n0_n1_grid_desc, b_global_buf, b_k_n0_n1_global_iterator_hacks); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run( + c_m10_m11_n10_n11_thread_desc, a_block_odd_buf, b_block_odd_buf, c_thread_buf); + + // LDS double buffer: store next data to LDS + a_blockwise_copy.RunWrite(a_k_m0_m1_block_desc, a_block_even_buf); + b_blockwise_copy.RunWrite(b_k_n0_n1_block_desc, b_block_even_buf); + + k_block_data_begin += 2 * KPerBlock; + } while(k_block_data_begin < K - 2 * KPerBlock); + } + + // LDS double buffer: tail + if constexpr(HasDoubleTailKBlockLoop) // if has 2 iteration left + { + a_blockwise_copy.MoveSrcSliceWindow(a_k_m0_m1_grid_desc, + a_block_slice_copy_step, + a_k_m0_m1_global_move_slice_window_iterator_hack); + b_blockwise_copy.MoveSrcSliceWindow(b_k_n0_n1_grid_desc, + b_block_slice_copy_step, + b_k_n0_n1_global_move_slice_window_iterator_hack); + + __syncthreads(); + + // LDS double buffer: load last data from device mem + a_blockwise_copy.RunRead( + a_k_m0_m1_grid_desc, a_global_buf, a_k_m0_m1_global_iterator_hacks); + b_blockwise_copy.RunRead( + b_k_n0_n1_grid_desc, b_global_buf, b_k_n0_n1_global_iterator_hacks); + + // LDS double buffer: GEMM on 2nd-last data + blockwise_gemm.Run( + c_m10_m11_n10_n11_thread_desc, a_block_even_buf, b_block_even_buf, c_thread_buf); + + // LDS double buffer: store last data to LDS + a_blockwise_copy.RunWrite(a_k_m0_m1_block_desc, a_block_odd_buf); + b_blockwise_copy.RunWrite(b_k_n0_n1_block_desc, b_block_odd_buf); + + __syncthreads(); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run( + c_m10_m11_n10_n11_thread_desc, a_block_odd_buf, b_block_odd_buf, c_thread_buf); + } + else // if has 1 iteration left + { + __syncthreads(); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run( + c_m10_m11_n10_n11_thread_desc, a_block_even_buf, b_block_even_buf, c_thread_buf); + } + + // output: register to global memory + { + constexpr index_t M11 = + M1PerThreadM111 * M11N11ThreadClusterM1100 * M11N11ThreadClusterM1101; + constexpr index_t N11 = + N1PerThreadN111 * M11N11ThreadClusterN1100 * M11N11ThreadClusterN1101; + + constexpr index_t M10 = MPerBlockM1 / M11; + constexpr index_t N10 = NPerBlockN1 / N11; + + constexpr index_t M111 = M1PerThreadM111; + constexpr index_t N111 = N1PerThreadN111; + + constexpr auto c_m0_m10_m11_n0_n10_n11_thread_desc = + make_dynamic_naive_tensor_descriptor_packed_v2( + make_tuple(I1, + Number{}, + Number{}, + I1, + Number{}, + Number{})); + + const auto c_m10_m11_n10_n11_thread_origin_idx_on_block = + blockwise_gemm.CalculateCM0M1N0N1ThreadOriginOnBlock(get_thread_local_1d_id()); + + ThreadwiseDynamicTensorSliceTransfer_v1r3< + FloatAcc, + FloatC, + decltype(c_m0_m10_m11_n0_n10_n11_thread_desc), + decltype(c_m0_m10_m11_n0_n10_n11_grid_desc), + Sequence<1, + c_m10_m11_n10_n11_thread_tensor_lengths[I0], + c_m10_m11_n10_n11_thread_tensor_lengths[I1], + 1, + c_m10_m11_n10_n11_thread_tensor_lengths[I2], + c_m10_m11_n10_n11_thread_tensor_lengths[I3]>, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + CGlobalMemoryDataOperation, + 1, + true>{c_m0_m10_m11_n0_n10_n11_grid_desc, + make_multi_index(im0, + c_m10_m11_n10_n11_thread_origin_idx_on_block[I0], + c_m10_m11_n10_n11_thread_origin_idx_on_block[I1], + in0, + 
c_m10_m11_n10_n11_thread_origin_idx_on_block[I2], + c_m10_m11_n10_n11_thread_origin_idx_on_block[I3])} + .Run(c_m0_m10_m11_n0_n10_n11_thread_desc, + make_tuple(I0, I0, I0, I0, I0, I0), + c_thread_buf, + c_m0_m10_m11_n0_n10_n11_grid_desc, + c_grid_buf, + CGridIteratorHacks{}); + } + } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r3.hpp b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r3.hpp new file mode 100644 index 0000000000..db3cb99121 --- /dev/null +++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r3.hpp @@ -0,0 +1,671 @@ +#ifndef CK_GRIDWISE_DYNAMIC_GEMM_V1R3_HPP +#define CK_GRIDWISE_DYNAMIC_GEMM_V1R3_HPP + +#include "common_header.hpp" +#include "dynamic_multi_index_transform_helper.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "blockwise_gemm_dlops_v2r3.hpp" +#include "blockwise_dynamic_tensor_slice_transfer_v2.hpp" +#include "threadwise_dynamic_tensor_slice_transfer_v2.hpp" +#include "threadwise_dynamic_tensor_slice_set.hpp" + +namespace ck { + +#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_dynamic_gemm_dlops_v1r3( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const AK0M0M1K1GridDesc a_k0_m0_m1_k1_grid_desc, + const BK0N0N1K1GridDesc b_k0_n0_n1_k1_grid_desc, + const CM0M10M11N0N10N11GridDesc c_m0_m10_m11_n0_n10_n11_grid_desc, + const CBlockIdToM0N0BlockClusterAdaptor c_blockid_to_m0_n0_block_cluster_adaptor) +{ + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared_block, + a_k0_m0_m1_k1_grid_desc, + b_k0_n0_n1_k1_grid_desc, + c_m0_m10_m11_n0_n10_n11_grid_desc, + c_blockid_to_m0_n0_block_cluster_adaptor, + integral_constant{}, + integral_constant{}); +} +#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER +// pass tensor descriptor by CONSTANT void pointer +// CONSTANT is needed to inform compiler void pointers in the kernel signature are pointing to +// non-modifiable parameter address space, so compiler can enable corresponding optimization +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_dynamic_gemm_dlops_v1r3( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const void CONSTANT* p_a_k0_m0_m1_k1_grid_desc, + const void CONSTANT* p_b_k0_n0_n1_k1_grid_desc, + const void CONSTANT* p_c_m0_m10_m11_n0_n10_n11_grid_desc, + const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor) +{ + // first cast void CONSTANT void* to void* + // second cast void* to Desc* + // the copy constructor of tensor descriptor doesn't take address_space(4) + const auto a_k0_m0_m1_k1_grid_desc = + *reinterpret_cast((const void*)p_a_k0_m0_m1_k1_grid_desc); + const auto b_k0_n0_n1_k1_grid_desc = + *reinterpret_cast((const void*)p_b_k0_n0_n1_k1_grid_desc); + const auto c_m0_m10_m11_n0_n10_n11_grid_desc = + *reinterpret_cast( + (const void*)p_c_m0_m10_m11_n0_n10_n11_grid_desc); + const auto c_blockid_to_m0_n0_block_cluster_adaptor = + *reinterpret_cast( + (const 
void*)p_c_blockid_to_m0_n0_block_cluster_adaptor); + + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared_block, + a_k0_m0_m1_k1_grid_desc, + b_k0_n0_n1_k1_grid_desc, + c_m0_m10_m11_n0_n10_n11_grid_desc, + c_blockid_to_m0_n0_block_cluster_adaptor, + integral_constant{}, + integral_constant{}); +} +#endif + +template +struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + // K1 should be Number<...> + static constexpr auto K1 = AK0MK1GridDesc{}.GetLength(I2); + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + // TODO: change this. I think it needs multi-dimensional alignment + constexpr auto max_lds_align = K1; + + // TODO: check alignment + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_k_m_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, Number{}, K1), max_lds_align); + + // TODO: check alignment + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_k_n_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, Number{}, K1), max_lds_align); + + // TODO: check alignment + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_aligned_space_size = + math::integer_least_multiple(a_k_m_block_desc.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_aligned_space_size = + math::integer_least_multiple(b_k_n_block_desc.GetElementSpaceSize(), max_lds_align); + + return 2 * (a_block_aligned_space_size + b_block_aligned_space_size) * sizeof(FloatAB); + } + + __host__ __device__ static constexpr bool + CheckValidity(const AK0MK1GridDesc& a_k0_m_k1_grid_desc, + const BK0NK1GridDesc& b_k0_n_k1_grid_desc, + const CMNGridDesc& c_m_n_grid_desc) + { + const auto M = a_k0_m_k1_grid_desc.GetLength(I1); + const auto N = b_k0_n_k1_grid_desc.GetLength(I1); + const auto K0 = a_k0_m_k1_grid_desc.GetLength(I0); + const auto K1 = a_k0_m_k1_grid_desc.GetLength(I2); + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + + return (M == c_m_n_grid_desc.GetLength(I0) && N == c_m_n_grid_desc.GetLength(I1) && + K0 == b_k0_n_k1_grid_desc.GetLength(I0) && + K1 == b_k0_n_k1_grid_desc.GetLength(I2)) && + (M % MPerBlockM1 == 0 && N % NPerBlockN1 == 0 && K0 % KPerBlock == 0); + } + + __host__ __device__ static constexpr index_t CalculateGridSize(index_t M, index_t N) + { + const index_t grid_size = (M / MPerBlockM1) * (N / NPerBlockN1); + + return grid_size; + } + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K0) + { + const bool has_main_k_block_loop = (K0 + KPerBlock) / (2 * KPerBlock) > 1; + + return has_main_k_block_loop; + } + + __host__ __device__ static constexpr bool CalculateHasDoubleTailKBlockLoop(index_t K0) + { + const bool has_double_tail_k_block_loop = (K0 / KPerBlock) % 2 == 0; + + return has_double_tail_k_block_loop; + } + + __host__ __device__ static constexpr auto + MakeAK0M0M1K1GridDescriptor(const AK0MK1GridDesc& a_k0_m_k1_grid_desc) + { + const auto K0 = a_k0_m_k1_grid_desc.GetLength(I0); + const auto M = a_k0_m_k1_grid_desc.GetLength(I1); + + const auto M1 = Number{}; + const auto M0 = M / M1; + + const auto a_k0_m0_m1_k1_grid_desc 
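// Illustrative note on the two loop-shape predicates above: the ping-pong main loop
// only exists when at least three K0-slices are available (the preload consumes one
// and the tail consumes one or two), and the tail processes two slices when the
// slice count is even, otherwise one. A stand-alone check of that bookkeeping,
// assuming K0 % KPerBlock == 0:

#include <cassert>

void check_loop_shape(int K0, int KPerBlock)
{
    const int  num_slices      = K0 / KPerBlock;
    const bool has_main_loop   = (K0 + KPerBlock) / (2 * KPerBlock) > 1; // i.e. num_slices >= 3
    const bool has_double_tail = (num_slices % 2 == 0);

    const int main_loop_slices = has_main_loop ? num_slices - (has_double_tail ? 2 : 1) : 0;
    const int tail_slices      = has_double_tail ? 2 : 1;

    assert(main_loop_slices + tail_slices == num_slices);
}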
= transform_dynamic_tensor_descriptor( + a_k0_m_k1_grid_desc, + make_tuple(make_pass_through_transform(K0), + make_unmerge_transform(make_tuple(M0, M1)), + make_pass_through_transform(K1)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + + return a_k0_m0_m1_k1_grid_desc; + } + + __host__ __device__ static constexpr auto + MakeBK0N0N1K1GridDescriptor(const BK0NK1GridDesc& b_k0_n_k1_grid_desc) + { + const auto K0 = b_k0_n_k1_grid_desc.GetLength(I0); + const auto N = b_k0_n_k1_grid_desc.GetLength(I1); + + const auto N1 = Number{}; + const auto N0 = N / N1; + + const auto b_k0_n0_n1_k1_grid_desc = transform_dynamic_tensor_descriptor( + b_k0_n_k1_grid_desc, + make_tuple(make_pass_through_transform(K0), + make_unmerge_transform(make_tuple(N0, N1)), + make_pass_through_transform(K1)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + + return b_k0_n0_n1_k1_grid_desc; + } + + __host__ __device__ static constexpr auto + MakeCM0M10M11N0N10N11GridDescriptor(const CMNGridDesc& c_m_n_grid_desc) + { + const auto M = c_m_n_grid_desc.GetLength(I0); + const auto N = c_m_n_grid_desc.GetLength(I1); + + constexpr auto M1 = Number{}; + constexpr auto N1 = Number{}; + + const auto M0 = M / M1; + const auto N0 = N / N1; + + constexpr auto M11 = + Number{}; + constexpr auto N11 = + Number{}; + + constexpr auto M10 = M1 / M11; + constexpr auto N10 = N1 / N11; + + const auto c_m0_m10_m11_n0_n10_n11_grid_desc = transform_dynamic_tensor_descriptor( + c_m_n_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(M0, M10, M11)), + make_unmerge_transform(make_tuple(N0, N10, N11))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); + + return c_m0_m10_m11_n0_n10_n11_grid_desc; + } + + __host__ __device__ static constexpr auto + MakeCBlockIdToM0N0BlockClusterAdaptor(const CMNGridDesc& c_m_n_grid_desc) + { + const auto M = c_m_n_grid_desc.GetLength(I0); + const auto N = c_m_n_grid_desc.GetLength(I1); + + constexpr auto M1 = Number{}; + constexpr auto N1 = Number{}; + + const auto M0 = M / M1; + const auto N0 = N / N1; + + const auto c_blockid_to_m0_n0_block_cluster_adaptor = + make_single_stage_tensor_adaptor(make_tuple(make_merge_transform(make_tuple(M0, N0))), + make_tuple(Sequence<0, 1>{}), + make_tuple(Sequence<0>{})); + + return c_blockid_to_m0_n0_block_cluster_adaptor; + } + + using AK0M0M1K1GridDesc = decltype(MakeAK0M0M1K1GridDescriptor(AK0MK1GridDesc{})); + using BK0N0N1K1GridDesc = decltype(MakeBK0N0N1K1GridDescriptor(BK0NK1GridDesc{})); + using CM0M10M11N0N10N11GridDesc = decltype(MakeCM0M10M11N0N10N11GridDescriptor(CMNGridDesc{})); + using CBlockIdToM0N0BlockClusterAdaptor = + decltype(MakeCBlockIdToM0N0BlockClusterAdaptor(CMNGridDesc{})); + + template + __device__ static void + Run(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + FloatAB* __restrict__ p_shared_block, + const AK0M0M1K1GridDesc& a_k0_m0_m1_k1_grid_desc, + const BK0N0N1K1GridDesc& b_k0_n0_n1_k1_grid_desc, + const CM0M10M11N0N10N11GridDesc& c_m0_m10_m11_n0_n10_n11_grid_desc, + const CBlockIdToM0N0BlockClusterAdaptor& c_blockid_to_m0_n0_block_cluster_adaptor, + integral_constant, + integral_constant) + { + const auto a_global_buf = make_dynamic_buffer( + p_a_grid, a_k0_m0_m1_k1_grid_desc.GetElementSpaceSize()); + const auto b_global_buf = make_dynamic_buffer( + p_b_grid, 
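// Illustrative note: the descriptor transforms above never move data; they only
// re-index it. A is viewed as (K0, M0, M1, K1) with M unmerged into (M0, M1),
// C is viewed as (M0, M10, M11, N0, N10, N11), and the block-cluster adaptor merges
// (M0, N0) back into a flat block id. The underlying arithmetic is mixed-radix
// decomposition; a sketch, assuming the conventional order in which the last
// dimension varies fastest:

#include <tuple>

std::tuple<int, int> block_id_to_m0_n0(int block_id, int N0)
{
    return {block_id / N0, block_id % N0}; // inverse of merge(M0, N0)
}

std::tuple<int, int, int> m_to_m0_m10_m11(int m, int M10, int M11)
{
    const int m0  = m / (M10 * M11);
    const int m10 = (m / M11) % M10;
    const int m11 = m % M11;
    return {m0, m10, m11}; // unmerge into (M0, M10, M11)
}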
b_k0_n0_n1_k1_grid_desc.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_m0_m10_m11_n0_n10_n11_grid_desc.GetElementSpaceSize()); + + // divide block work by [M, N] + const auto c_m0_n0_block_cluster_idx = + c_blockid_to_m0_n0_block_cluster_adaptor.CalculateBottomIndex( + make_multi_index(get_block_1d_id())); + + // HACK: this force index data into SGPR + const index_t im0 = __builtin_amdgcn_readfirstlane(c_m0_n0_block_cluster_idx[I0]); + const index_t in0 = __builtin_amdgcn_readfirstlane(c_m0_n0_block_cluster_idx[I1]); + + // TODO: change this. I think it needs multi-dimensional alignment + constexpr auto max_lds_align = K1; + + // TODO: check alignment + // A matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto a_k0_m0_m1_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, I1, Number{}, K1), max_lds_align); + + // TODO: check alignment + // B matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto b_k0_n0_n1_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, I1, Number{}, K1), max_lds_align); + + // TODO: check alignment + // A matrix in LDS memory, for blockwise GEMM + constexpr auto a_k0_m_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, Number{}, K1), max_lds_align); + + // TODO: check alignment + // B matrix in LDS memory, for blockwise GEMM + constexpr auto b_k0_n_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, Number{}, K1), max_lds_align); + + static_assert(a_k0_m0_m1_k1_block_desc.GetElementSpaceSize() == + a_k0_m_k1_block_desc.GetElementSpaceSize() && + b_k0_n0_n1_k1_block_desc.GetElementSpaceSize() == + b_k0_n_k1_block_desc.GetElementSpaceSize() && + "wrong!"); + + // A matrix blockwise copy + auto a_blockwise_copy = BlockwiseDynamicTensorSliceTransfer_v4r1< + BlockSize, + InMemoryDataOperationEnum_t::Set, + Sequence, + ABlockTransferThreadSliceLengths_K0_M0_M1_K1, + ABlockTransferThreadClusterLengths_K0_M0_M1_K1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_k0_m0_m1_k1_grid_desc), + decltype(a_k0_m0_m1_k1_block_desc), + ABlockTransferSrcAccessOrder, + Sequence<0, 1, 2, 3>, + ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, // SrcVectorTensorLengths + ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, // DstVectorTensorLengths + ABlockTransferSrcVectorTensorContiguousDimOrder, // SrcVectorTensorContiguousDimOrder + Sequence<0, 1, 2, 3>, // DstVectorTensorContiguousDimOrder + false, + true>(a_k0_m0_m1_k1_grid_desc, + make_multi_index(0, im0, 0, 0), + a_k0_m0_m1_k1_block_desc, + make_multi_index(0, 0, 0, 0)); + + // B matrix blockwise copy + auto b_blockwise_copy = BlockwiseDynamicTensorSliceTransfer_v4r1< + BlockSize, + InMemoryDataOperationEnum_t::Set, + Sequence, + BBlockTransferThreadSliceLengths_K0_N0_N1_K1, + BBlockTransferThreadClusterLengths_K0_N0_N1_K1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_k0_n0_n1_k1_grid_desc), + decltype(b_k0_n0_n1_k1_block_desc), + BBlockTransferSrcAccessOrder, + Sequence<0, 1, 2, 3>, + BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, // SrcVectorTensorLengths + BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, // DstVectorTensorLengths + BBlockTransferSrcVectorTensorContiguousDimOrder, // SrcVectorTensorContiguousDimOrder + Sequence<0, 1, 2, 3>, // DstVectorTensorContiguousDimOrder + false, + 
true>(b_k0_n0_n1_k1_grid_desc, + make_multi_index(0, in0, 0, 0), + b_k0_n0_n1_k1_block_desc, + make_multi_index(0, 0, 0, 0)); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[KPerBlock, MPerBlockM1] is in LDS + // b_mtx[KPerBlocl, NPerBlockN1] is in LDS + // c_mtx[MPerBlockM1, NPerBlockN1] is distributed among threads, and saved in + // register + const auto blockwise_gemm = + BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< + BlockSize, + FloatAB, + FloatAB, + FloatAcc, + decltype(a_k0_m_k1_block_desc), + decltype(b_k0_n_k1_block_desc), + M1PerThreadM111, + N1PerThreadN111, + KPerThread, + M11N11ThreadClusterM110Xs, + M11N11ThreadClusterN110Xs, + M1PerThreadM111, + N1PerThreadN111>{}; + + constexpr auto c_m10_m11_n10_n11_thread_tensor_lengths = + decltype(blockwise_gemm)::GetCThreadTensorLengths_BM0_BM1_BN0_BN1(); + + constexpr auto c_m10_m11_n10_n11_thread_desc = + make_dynamic_naive_tensor_descriptor_packed_v2( + sequence_to_tuple_of_number(c_m10_m11_n10_n11_thread_tensor_lengths)); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_aligned_space_size = math::integer_least_multiple( + a_k0_m0_m1_k1_block_desc.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_aligned_space_size = math::integer_least_multiple( + b_k0_n0_n1_k1_block_desc.GetElementSpaceSize(), max_lds_align); + + FloatAB* p_a_block_double = p_shared_block; + FloatAB* p_b_block_double = p_shared_block + 2 * a_block_aligned_space_size; + + // register allocation for output + auto c_thread_buf = make_static_buffer( + c_m10_m11_n10_n11_thread_desc.GetElementSpaceSize()); + + ThreadwiseDynamicTensorSliceSet_v1{} + .Run(c_m10_m11_n10_n11_thread_desc, + make_tuple(I0, I0, I0, I0), + c_thread_buf, + FloatAcc{0}); + + constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0, 0); + + auto a_block_even_buf = make_dynamic_buffer( + p_a_block_double, a_k0_m0_m1_k1_block_desc.GetElementSpaceSize()); + auto b_block_even_buf = make_dynamic_buffer( + p_b_block_double, b_k0_n0_n1_k1_block_desc.GetElementSpaceSize()); + + auto a_block_odd_buf = make_dynamic_buffer( + p_a_block_double + a_block_aligned_space_size, + a_k0_m0_m1_k1_block_desc.GetElementSpaceSize()); + auto b_block_odd_buf = make_dynamic_buffer( + p_b_block_double + b_block_aligned_space_size, + b_k0_n0_n1_k1_block_desc.GetElementSpaceSize()); + + // LDS double buffer: preload data into LDS + { + a_blockwise_copy.RunRead(a_k0_m0_m1_k1_grid_desc, a_global_buf, AGridIteratorHacks{}); + b_blockwise_copy.RunRead(b_k0_n0_n1_k1_grid_desc, b_global_buf, BGridIteratorHacks{}); + + a_blockwise_copy.RunWrite(a_k0_m0_m1_k1_block_desc, a_block_even_buf); + b_blockwise_copy.RunWrite(b_k0_n0_n1_k1_block_desc, b_block_even_buf); + } + + if constexpr(HasMainKBlockLoop) + { + const auto K0 = a_k0_m0_m1_k1_grid_desc.GetLength(I0); + + index_t k_block_data_begin = 0; + + // LDS double buffer: main body + // use Do-While loop instead of For loop to simplify control flow + do + { + // even iteration + a_blockwise_copy.MoveSrcSliceWindow(a_k0_m0_m1_k1_grid_desc, + a_block_slice_copy_step, + AGridMoveSliceWindowIteratorHacks{}); + b_blockwise_copy.MoveSrcSliceWindow(b_k0_n0_n1_k1_grid_desc, + b_block_slice_copy_step, + BGridMoveSliceWindowIteratorHacks{}); + + __syncthreads(); + + // LDS doubel buffer: load next data from device mem + a_blockwise_copy.RunRead( + a_k0_m0_m1_k1_grid_desc, 
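// Illustrative note: the single __shared__ allocation is carved into four regions:
// the A double buffer occupies the first 2 * a_block_aligned_space_size elements
// (even half, then odd half) and the B double buffer occupies the next
// 2 * b_block_aligned_space_size. A sketch of the resulting offsets, in units of
// FloatAB elements (the sizes are placeholders for whatever the aligned
// element-space sizes evaluate to):

#include <cstdio>

void print_lds_layout(int a_aligned_size, int b_aligned_size)
{
    std::printf("a_even: [%d, %d)\n", 0, a_aligned_size);
    std::printf("a_odd : [%d, %d)\n", a_aligned_size, 2 * a_aligned_size);
    std::printf("b_even: [%d, %d)\n", 2 * a_aligned_size, 2 * a_aligned_size + b_aligned_size);
    std::printf("b_odd : [%d, %d)\n",
                2 * a_aligned_size + b_aligned_size,
                2 * (a_aligned_size + b_aligned_size));
}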
a_global_buf, AGridIteratorHacks{}); + b_blockwise_copy.RunRead( + b_k0_n0_n1_k1_grid_desc, b_global_buf, BGridIteratorHacks{}); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run(c_m10_m11_n10_n11_thread_desc, + a_block_even_buf, + b_block_even_buf, + c_thread_buf); + + // LDS double buffer: store next data to LDS + a_blockwise_copy.RunWrite(a_k0_m0_m1_k1_block_desc, a_block_odd_buf); + b_blockwise_copy.RunWrite(b_k0_n0_n1_k1_block_desc, b_block_odd_buf); + + // odd iteration + a_blockwise_copy.MoveSrcSliceWindow(a_k0_m0_m1_k1_grid_desc, + a_block_slice_copy_step, + AGridMoveSliceWindowIteratorHacks{}); + b_blockwise_copy.MoveSrcSliceWindow(b_k0_n0_n1_k1_grid_desc, + b_block_slice_copy_step, + BGridMoveSliceWindowIteratorHacks{}); + + __syncthreads(); + + // LDS doubel buffer: load next data from device mem + a_blockwise_copy.RunRead( + a_k0_m0_m1_k1_grid_desc, a_global_buf, AGridIteratorHacks{}); + b_blockwise_copy.RunRead( + b_k0_n0_n1_k1_grid_desc, b_global_buf, BGridIteratorHacks{}); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run( + c_m10_m11_n10_n11_thread_desc, a_block_odd_buf, b_block_odd_buf, c_thread_buf); + + // LDS double buffer: store next data to LDS + a_blockwise_copy.RunWrite(a_k0_m0_m1_k1_block_desc, a_block_even_buf); + b_blockwise_copy.RunWrite(b_k0_n0_n1_k1_block_desc, b_block_even_buf); + + k_block_data_begin += 2 * KPerBlock; + } while(k_block_data_begin < K0 - 2 * KPerBlock); + } + + // LDS double buffer: tail + if constexpr(HasDoubleTailKBlockLoop) // if has 2 iteration left + { + a_blockwise_copy.MoveSrcSliceWindow(a_k0_m0_m1_k1_grid_desc, + a_block_slice_copy_step, + AGridMoveSliceWindowIteratorHacks{}); + b_blockwise_copy.MoveSrcSliceWindow(b_k0_n0_n1_k1_grid_desc, + b_block_slice_copy_step, + BGridMoveSliceWindowIteratorHacks{}); + + __syncthreads(); + + // LDS double buffer: load last data from device mem + a_blockwise_copy.RunRead(a_k0_m0_m1_k1_grid_desc, a_global_buf, AGridIteratorHacks{}); + b_blockwise_copy.RunRead(b_k0_n0_n1_k1_grid_desc, b_global_buf, BGridIteratorHacks{}); + + // LDS double buffer: GEMM on 2nd-last data + blockwise_gemm.Run( + c_m10_m11_n10_n11_thread_desc, a_block_even_buf, b_block_even_buf, c_thread_buf); + + // LDS double buffer: store last data to LDS + a_blockwise_copy.RunWrite(a_k0_m0_m1_k1_block_desc, a_block_odd_buf); + b_blockwise_copy.RunWrite(b_k0_n0_n1_k1_block_desc, b_block_odd_buf); + + __syncthreads(); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run( + c_m10_m11_n10_n11_thread_desc, a_block_odd_buf, b_block_odd_buf, c_thread_buf); + } + else // if has 1 iteration left + { + __syncthreads(); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run( + c_m10_m11_n10_n11_thread_desc, a_block_even_buf, b_block_even_buf, c_thread_buf); + } + + // output: register to global memory + { + constexpr auto M11 = + Number{}; + constexpr auto N11 = + Number{}; + + constexpr index_t M10 = MPerBlockM1 / M11; + constexpr index_t N10 = NPerBlockN1 / N11; + + constexpr index_t M111 = M1PerThreadM111; + constexpr index_t N111 = N1PerThreadN111; + + constexpr auto c_m0_m10_m11_n0_n10_n11_thread_desc = + make_dynamic_naive_tensor_descriptor_packed_v2( + make_tuple(I1, + Number{}, + Number{}, + I1, + Number{}, + Number{})); + + const auto c_m10_m11_n10_n11_thread_origin_idx_on_block = + blockwise_gemm.CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1( + get_thread_local_1d_id()); + + ThreadwiseDynamicTensorSliceTransfer_v1r3< + FloatAcc, + FloatC, + 
decltype(c_m0_m10_m11_n0_n10_n11_thread_desc), + decltype(c_m0_m10_m11_n0_n10_n11_grid_desc), + Sequence<1, + c_m10_m11_n10_n11_thread_tensor_lengths[I0], + c_m10_m11_n10_n11_thread_tensor_lengths[I1], + 1, + c_m10_m11_n10_n11_thread_tensor_lengths[I2], + c_m10_m11_n10_n11_thread_tensor_lengths[I3]>, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + CGlobalMemoryDataOperation, + 1, + true>{c_m0_m10_m11_n0_n10_n11_grid_desc, + make_multi_index(im0, + c_m10_m11_n10_n11_thread_origin_idx_on_block[I0], + c_m10_m11_n10_n11_thread_origin_idx_on_block[I1], + in0, + c_m10_m11_n10_n11_thread_origin_idx_on_block[I2], + c_m10_m11_n10_n11_thread_origin_idx_on_block[I3])} + .Run(c_m0_m10_m11_n0_n10_n11_thread_desc, + make_tuple(I0, I0, I0, I0, I0, I0), + c_thread_buf, + c_m0_m10_m11_n0_n10_n11_grid_desc, + c_grid_buf, + CGridIteratorHacks{}); + } + } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v2.hpp b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v2.hpp new file mode 100644 index 0000000000..34dea34833 --- /dev/null +++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v2.hpp @@ -0,0 +1,463 @@ +#ifndef CK_GRIDWISE_DYNAMIC_GEMM_V2_HPP +#define CK_GRIDWISE_DYNAMIC_GEMM_V2_HPP + +#include "common_header.hpp" +#include "dynamic_multi_index_transform_helper.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "blockwise_dynamic_tensor_slice_transfer.hpp" +#include "threadwise_dynamic_tensor_slice_transfer.hpp" +#include "blockwise_gemm_dlops_v3.hpp" + +namespace ck { + +template +struct GridwiseDynamicGemmDlops_km_kn_mn_v3 +{ + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + constexpr auto E = EPerBlock * 3 * 3; + + constexpr auto max_lds_align = + math::lcm(Number{}, Number{}); + + // A matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto a_e_k_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, Number{}), max_lds_align); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size = + math::integer_least_multiple(a_e_k_desc.GetElementSpaceSize(), max_lds_align); + + return a_block_space_size * sizeof(FloatAB); + } + + template + __device__ void Run(const AGlobalDesc& a_e_k_global_desc, + const FloatAB* __restrict__ p_a_global, + const BGlobalDesc& b_e_n_ho_wo_global_desc, + const FloatAB* __restrict__ p_b_global, + const CGlobalDesc& c_k_n_ho_wo_global_desc, + FloatC* __restrict__ p_c_global, + FloatAB* __restrict__ p_shared_block, + integral_constant, + integral_constant) const + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const auto a_global_buf = make_dynamic_buffer( + p_a_global, a_e_k_global_desc.GetElementSpaceSize()); + const auto b_global_buf = make_dynamic_buffer( + p_b_global, b_e_n_ho_wo_global_desc.GetElementSpaceSize()); + auto c_global_buf = make_dynamic_buffer( + p_c_global, c_k_n_ho_wo_global_desc.GetElementSpaceSize()); + + constexpr auto E = EPerBlock * 3 * 3; + + // const auto E = a_e_k_global_desc.GetLength(I0); + const auto K = a_e_k_global_desc.GetLength(I1); + + const auto N = b_e_n_ho_wo_global_desc.GetLength(I1); + const auto Ho = b_e_n_ho_wo_global_desc.GetLength(I2); + const auto Wo = 
b_e_n_ho_wo_global_desc.GetLength(I3); + +// divide block work by [M, N] +#if 0 + const auto k_block_work_num = K / Number{}; + const auto ho_block_work_num = Ho / Number{}; + const auto wo_block_work_num = Wo / Number{}; + const auto hwo_block_work_num = ho_block_work_num * wo_block_work_num; + + const index_t k_block_work_id = get_block_1d_id() / hwo_block_work_num; + const index_t hwo_block_work_id = get_block_1d_id() - k_block_work_id * hwo_block_work_num; + + const index_t ho_block_work_id = hwo_block_work_id / wo_block_work_num; + const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num; +#else + // Hack: this force result into SGPR + const index_t k_block_work_num = __builtin_amdgcn_readfirstlane(K / KPerBlock); + const index_t ho_block_work_num = __builtin_amdgcn_readfirstlane(Ho / HoPerBlock); + const index_t wo_block_work_num = __builtin_amdgcn_readfirstlane(Wo / WoPerBlock); + const index_t hwo_block_work_num = ho_block_work_num * wo_block_work_num; + + const index_t k_block_work_id = + __builtin_amdgcn_readfirstlane(get_block_1d_id() / hwo_block_work_num); + const index_t hwo_block_work_id = get_block_1d_id() - k_block_work_id * hwo_block_work_num; + + const index_t ho_block_work_id = + __builtin_amdgcn_readfirstlane(hwo_block_work_id / wo_block_work_num); + const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num; +#endif + + // lds max alignment + constexpr auto max_lds_align = + math::lcm(Number{}, Number{}); + + // A matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto a_e_k_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, Number{}), max_lds_align); + + constexpr auto a_e_k_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, Number{}), max_lds_align); + + // B matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto b_e_n_ho_wo_block_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple( + Number{}, Number<1>{}, Number{}, Number{})); + + // c_thread_mtx definition: this is a mess + // TODO:: more elegent way of defining c_thread_mtx + constexpr auto c_k_n_ho_wo_thread_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple( + Number{}, Number<1>{}, Number{}, Number{})); + + auto blockwise_gemm = + BlockwiseGemmDlops_km_kn_m0m1n0n1_v3{}; + + auto c_thread_mtx_index = blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); + + const auto k_thread_id = c_thread_mtx_index.k; + const auto ho_thread_id = c_thread_mtx_index.h; + const auto wo_thread_id = c_thread_mtx_index.w; + + const index_t k_block_data_on_global = k_block_work_id * KPerBlock; + const index_t ho_block_data_on_global = ho_block_work_id * HoPerBlock; + const index_t wo_block_data_on_global = wo_block_work_id * WoPerBlock; + + const index_t ho_thread_data_on_global = + ho_block_data_on_global + ho_thread_id * HoPerThread; + const index_t wo_thread_data_on_global = + wo_block_data_on_global + wo_thread_id * WoPerThread; + + // A matrix blockwise copy + auto a_blockwise_copy = + BlockwiseDynamicTensorSliceTransfer_v4, + ABlockTransferThreadSliceLengths_E_K, + ABlockTransferThreadClusterLengths_E_K, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_e_k_global_desc), + decltype(a_e_k_desc), + ABlockTransferSrcAccessOrder, + Sequence<0, 1>, + ABlockTransferSrcVectorDim, + 1, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K, + 1, + 
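// Illustrative note: each workgroup owns a KPerBlock x HoPerBlock x WoPerBlock
// output tile, so the flat block id is decomposed into (k, ho, wo) tile ids; the
// __builtin_amdgcn_readfirstlane wrappers above only assert wave-uniformity so the
// results stay in scalar registers. The plain arithmetic, assuming Ho and Wo tile
// exactly:

#include <tuple>

std::tuple<int, int, int>
block_id_to_k_ho_wo(int block_id, int Ho, int Wo, int HoPerBlock, int WoPerBlock)
{
    const int ho_blocks  = Ho / HoPerBlock;
    const int wo_blocks  = Wo / WoPerBlock;
    const int hwo_blocks = ho_blocks * wo_blocks;

    const int k_id   = block_id / hwo_blocks;
    const int hwo_id = block_id - k_id * hwo_blocks;
    const int ho_id  = hwo_id / wo_blocks;
    const int wo_id  = hwo_id - ho_id * wo_blocks;

    return {k_id, ho_id, wo_id};
}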
1, + AThreadTransferSrcResetCoordinateAfterRun, + true>( + a_e_k_global_desc, + make_multi_index(0, k_block_data_on_global), + a_e_k_desc, + make_multi_index(0, 0)); + + constexpr auto b_e_n_ho_wo_thread_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple( + Number{}, Number<1>{}, Number{}, Number{})); + + auto b_threadwise_transfer = ThreadwiseDynamicTensorSliceTransfer_v2< + FloatAB, + FloatAB, + decltype(b_e_n_ho_wo_global_desc), + decltype(b_e_n_ho_wo_thread_desc), + Sequence, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + 1, + true>(b_e_n_ho_wo_global_desc, + make_multi_index(0, 0, ho_thread_data_on_global, wo_thread_data_on_global)); + + auto a_block_buf = make_dynamic_buffer( + p_shared_block, a_e_k_desc.GetElementSpaceSize()); + + // register allocation for output + StaticBuffer + c_thread_buf; + + // initialize output thread tensor + ThreadwiseDynamicTensorSliceSet_v1>{} + .Run(c_k_n_ho_wo_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, FloatAcc{0}); + + constexpr auto b_thread_slice_copy_step = make_multi_index(EPerBlock, 0, 0, 0); + + // hack to control index calculation when iterating over A and B matrix for threadwise copy + constexpr auto a_e_k_global_iterator_hacks = AGlobalIteratorHacks{}; + constexpr auto b_e_n_ho_wo_global_iterator_hacks = BGlobalIteratorHacks{}; + + // hack to control index calculation when move slice window for A and B matrix for + // threadwise copy + constexpr auto a_e_k_global_move_slice_window_iterator_hack = + AGlobalMoveSliceWindowIteratorHacks{}; + constexpr auto b_e_n_ho_wo_global_move_slice_window_iterator_hack = + BGlobalMoveSliceWindowIteratorHacks{}; + + // double regsiter buffer for b + StaticBuffer + b_thread_even_buf, b_thread_odd_buf; + + // LDS double buffer: preload data + { + a_blockwise_copy.RunRead(a_e_k_global_desc, a_global_buf, a_e_k_global_iterator_hacks); + + b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc, + b_global_buf, + b_e_n_ho_wo_thread_desc, + make_tuple(I0, I0, I0, I0), + b_thread_even_buf, + b_e_n_ho_wo_global_iterator_hacks); + + a_blockwise_copy.RunWrite(a_e_k_desc, a_block_buf); + } + + __syncthreads(); + + if constexpr(HasMainKBlockLoop) + { + index_t e_block_data_begin = 0; + + // LDS double buffer: main body + // use Do-While loop instead of For loop to simplify control flow + do + { + // even iteration + b_threadwise_transfer.MoveSrcSliceWindow(b_e_n_ho_wo_global_desc, + b_thread_slice_copy_step); + + b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc, + b_global_buf, + b_e_n_ho_wo_thread_desc, + make_tuple(I0, I0, I0, I0), + b_thread_odd_buf, + b_e_n_ho_wo_global_iterator_hacks); + + // LDS double buffer: GEMM on current data + // TODO: @Zhang Jing: blockwise gemm should be able to move slice window + blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf); + + blockwise_gemm.MoveASliceWindow(a_e_k_block_desc, make_tuple(EPerBlock, 0)); + + b_threadwise_transfer.MoveSrcSliceWindow(b_e_n_ho_wo_global_desc, + b_thread_slice_copy_step); + + b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc, + b_global_buf, + b_e_n_ho_wo_thread_desc, + make_tuple(I0, I0, I0, I0), + b_thread_even_buf, + b_e_n_ho_wo_global_iterator_hacks); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf); + + blockwise_gemm.MoveASliceWindow(a_e_k_block_desc, make_tuple(EPerBlock, 0)); + + e_block_data_begin += 2 * EPerBlock; + + } while(e_block_data_begin < E - 2 * EPerBlock); + } + + // LDS double 
buffer: tail + if constexpr(HasDoubleTailKBlockLoop) // if has 2 iteration left + { + b_threadwise_transfer.MoveSrcSliceWindow(b_e_n_ho_wo_global_desc, + b_thread_slice_copy_step); + + b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc, + b_global_buf, + b_e_n_ho_wo_thread_desc, + make_tuple(I0, I0, I0, I0), + b_thread_odd_buf, + b_e_n_ho_wo_global_iterator_hacks); + + // LDS double buffer: GEMM on 2nd-last data + blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf); + + blockwise_gemm.MoveASliceWindow(a_e_k_block_desc, make_tuple(EPerBlock, 0)); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf); + } + else // if has 1 iteration left + { + // LDS double buffer: GEMM on last data + blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf); + } + + // output: register to global memory + { + // hack to control index calculation when iterating over c_k_n_ho_wo_global tensor + constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = CGlobalIteratorHacks{}; + + const index_t k_thread_data_on_global = + k_block_data_on_global + k_thread_id * KPerThread; + + ThreadwiseDynamicTensorSliceTransfer_v1r3< + FloatAcc, + FloatC, + decltype(c_k_n_ho_wo_thread_desc), + decltype(c_k_n_ho_wo_global_desc), + Sequence, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + CGlobalMemoryDataOperation, + 1, + true>( + c_k_n_ho_wo_global_desc, + make_multi_index( + k_thread_data_on_global, 0, ho_thread_data_on_global, wo_thread_data_on_global)) + .Run(c_k_n_ho_wo_thread_desc, + make_tuple(I0, I0, I0, I0), + c_thread_buf, + c_k_n_ho_wo_global_desc, + c_global_buf, + c_k_n_ho_wo_global_tensor_iterator_hacks); + } + } + + // pass tensor descriptor by reference + template + __device__ void Run(const AGlobalDesc& a_e_k_global_desc, + const FloatAB* __restrict__ p_a_global, + const BGlobalDesc& b_e_n_ho_wo_global_desc, + const FloatAB* __restrict__ p_b_global, + const CGlobalDesc& c_k_n_ho_wo_global_desc, + FloatC* __restrict__ p_c_global, + integral_constant, + integral_constant) const + { + constexpr index_t shared_block_size = GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + Run(a_e_k_global_desc, + p_a_global, + b_e_n_ho_wo_global_desc, + p_b_global, + c_k_n_ho_wo_global_desc, + p_c_global, + p_shared_block, + integral_constant{}, + integral_constant{}); + } + + // pass tensor descriptors by their pointers + template + __device__ void Run(const AGlobalDesc* p_a_e_k_global_desc, + const FloatAB* __restrict__ p_a_global, + const BGlobalDesc* p_b_e_n_ho_wo_global_desc, + const FloatAB* __restrict__ p_b_global, + const CGlobalDesc* p_c_k_n_ho_wo_global_desc, + FloatC* __restrict__ p_c_global, + integral_constant, + integral_constant) const + { + const auto a_e_k_global_desc = *p_a_e_k_global_desc; + const auto b_e_n_ho_wo_global_desc = *p_b_e_n_ho_wo_global_desc; + const auto c_k_n_ho_wo_global_desc = *p_c_k_n_ho_wo_global_desc; + + Run(a_e_k_global_desc, + p_a_global, + b_e_n_ho_wo_global_desc, + p_b_global, + c_k_n_ho_wo_global_desc, + p_c_global, + integral_constant{}, + integral_constant{}); + } + + // pass tensor descriptors by void* + template + __device__ void Run(const void* p_a_e_k_global_desc, + const FloatAB* __restrict__ p_a_global, + const void* p_b_e_n_ho_wo_global_desc, + const FloatAB* __restrict__ p_b_global, + const void* p_c_k_n_ho_wo_global_desc, + FloatC* __restrict__ p_c_global, + integral_constant, + 
integral_constant) const + { + const auto a_e_k_global_desc = *reinterpret_cast(p_a_e_k_global_desc); + const auto b_e_n_ho_wo_global_desc = + *reinterpret_cast(p_b_e_n_ho_wo_global_desc); + const auto c_k_n_ho_wo_global_desc = + *reinterpret_cast(p_c_k_n_ho_wo_global_desc); + + Run(a_e_k_global_desc, + p_a_global, + b_e_n_ho_wo_global_desc, + p_b_global, + c_k_n_ho_wo_global_desc, + p_c_global, + integral_constant{}, + integral_constant{}); + } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp new file mode 100644 index 0000000000..a5b1de79a7 --- /dev/null +++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp @@ -0,0 +1,823 @@ +#ifndef CK_GRIDWISE_DYNAMIC_GEMM_XDLOPS_V2R3_HPP +#define CK_GRIDWISE_DYNAMIC_GEMM_XDLOPS_V2R3_HPP + +#include "common_header.hpp" +#include "dynamic_multi_index_transform_helper.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "blockwise_gemm_xdlops.hpp" +#include "blockwise_dynamic_tensor_slice_transfer.hpp" +#include "threadwise_dynamic_tensor_slice_transfer.hpp" +#include "threadwise_dynamic_tensor_slice_set.hpp" + +namespace ck { + +#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_dynamic_gemm_xdlops_v2r3(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const AK0MK1GridDesc a_k0_m_k1_grid_desc, + const BK0NK1GridDesc b_k0_n_k1_grid_desc, + const CM0M1M2NGridDesc c_m0_m1_m2_n_grid_desc, + const CBlockClusterAdaptor c_block_cluster_adaptor) +{ + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared_block, + a_k0_m_k1_grid_desc, + b_k0_n_k1_grid_desc, + c_m0_m1_m2_n_grid_desc, + c_block_cluster_adaptor); +} +#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_dynamic_gemm_xdlops_v2r3(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const void CONSTANT* p_a_k0_m_k1_grid_desc, + const void CONSTANT* p_b_k0_n_k1_grid_desc, + const void CONSTANT* p_c_m0_m1_m2_n_grid_desc, + const void CONSTANT* p_c_block_cluster_adaptor) +{ + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + const auto a_k0_m_k1_grid_desc = + *reinterpret_cast((const void*)p_a_k0_m_k1_grid_desc); + const auto b_k0_n_k1_grid_desc = + *reinterpret_cast((const void*)p_b_k0_n_k1_grid_desc); + const auto c_m0_m1_m2_n_grid_desc = + *reinterpret_cast((const void*)p_c_m0_m1_m2_n_grid_desc); + const auto c_block_cluster_adaptor = + *reinterpret_cast((const void*)p_c_block_cluster_adaptor); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared_block, + a_k0_m_k1_grid_desc, + b_k0_n_k1_grid_desc, + c_m0_m1_m2_n_grid_desc, + c_block_cluster_adaptor); +} +#endif + +template +struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3 +{ + static constexpr auto I0 = 
Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + // K1 should be Number<...> + static constexpr auto K1 = Number{}; + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + constexpr auto max_lds_align = K1; + + // A matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto a_k0_m_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, Number{}, K1), max_lds_align); + + // B matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto b_k0_n_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, Number{}, K1), max_lds_align); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size = + math::integer_least_multiple(a_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_space_size = + math::integer_least_multiple(b_k0_n_k1_block_desc.GetElementSpaceSize(), max_lds_align); + + return (a_block_space_size + b_block_space_size) * sizeof(FloatAB); + } + + __host__ __device__ static constexpr bool + CheckValidity(const AK0MK1GridDesc& a_k0_m_k1_grid_desc, + const BK0NK1GridDesc& b_k0_n_k1_grid_desc, + const CMNGridDesc& c_m_n_grid_desc) + { + // TODO: turn on this + static_assert(is_known_at_compile_time>::value, + "wrong! K1 need to be known at compile-time"); + + const auto M = a_k0_m_k1_grid_desc.GetLength(I1); + const auto N = b_k0_n_k1_grid_desc.GetLength(I1); + const auto K0 = a_k0_m_k1_grid_desc.GetLength(I0); + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + + return (M == c_m_n_grid_desc.GetLength(I0) && N == c_m_n_grid_desc.GetLength(I1) && + K0 == b_k0_n_k1_grid_desc.GetLength(I0) && + K1 == a_k0_m_k1_grid_desc.GetLength(I2) && + K1 == b_k0_n_k1_grid_desc.GetLength(I2)) && + (M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % KPerBlock == 0) && + (MPerBlock % MPerWave == 0 && NPerBlock % NPerWave == 0); + } + + __host__ __device__ static constexpr index_t + CalculateGridSize(const CMNGridDesc& c_m_n_grid_desc) + { + const auto M = c_m_n_grid_desc.GetLength(I0); + const auto N = c_m_n_grid_desc.GetLength(I1); + + const index_t grid_size = (M / MPerBlock) * (N / NPerBlock); + + return grid_size; + } + + __host__ __device__ static constexpr auto + MakeCM0M1M2NGridDescriptor(const CMNGridDesc& c_m_n_grid_desc) + { + const auto M = c_m_n_grid_desc.GetLength(I0); + const auto N = c_m_n_grid_desc.GetLength(I1); + + constexpr auto xdlops_gemm = XdlopsGemm{}; + + constexpr auto CLayout = xdlops_gemm.GetCLayout(); + + constexpr auto M0 = Number{}; + constexpr auto M1 = Number{}; + constexpr auto M2 = Number{}; + + constexpr index_t MWaves = MPerBlock / (MPerWave * MRepeat); + constexpr index_t NWaves = NPerBlock / (NPerWave * NRepeat); + + constexpr auto N0 = Number{}; + constexpr auto N1 = Number{}; + + const auto c_m0_m1_m2_n_grid_desc = transform_dynamic_tensor_descriptor( + c_m_n_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, M0, M1, M2)), + make_unmerge_transform(make_tuple(NRepeat, NWaves, N1))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2, 4, 5, 6>{}, Sequence<1, 3, 7>{})); + + return c_m0_m1_m2_n_grid_desc; + } + + __host__ __device__ static constexpr auto + MakeCBlockClusterAdaptor(const CMNGridDesc& c_m_n_grid_desc) + { + const auto M = c_m_n_grid_desc.GetLength(I0); + 
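// Illustrative note: CheckValidity above encodes the tiling contract (the problem
// must tile exactly into MPerBlock x NPerBlock x KPerBlock blocks, and each block
// tile must tile into waves), and CalculateGridSize launches one workgroup per
// MPerBlock x NPerBlock output tile. A compact host-side sketch of that contract,
// with the descriptor-consistency checks omitted:

int xdlops_grid_size(int M, int N, int K0,
                     int MPerBlock, int NPerBlock, int KPerBlock,
                     int MPerWave, int NPerWave)
{
    const bool valid = M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % KPerBlock == 0 &&
                       MPerBlock % MPerWave == 0 && NPerBlock % NPerWave == 0;

    return valid ? (M / MPerBlock) * (N / NPerBlock) : -1; // -1 signals an invalid tiling
}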
const auto N = c_m_n_grid_desc.GetLength(I1); + + constexpr auto M1 = Number{}; + constexpr auto N1 = Number{}; + + const auto M0 = M / M1; + const auto N0 = N / N1; + +#if 1 + const auto c_blockid_to_m0_n0_block_cluster_adaptor = + make_single_stage_tensor_adaptor(make_tuple(make_merge_transform(make_tuple(M0, N0))), + make_tuple(Sequence<0, 1>{}), + make_tuple(Sequence<0>{})); +#elif 1 + const auto c_blockid_to_m0_n0_block_cluster_adaptor = + make_single_stage_tensor_adaptor(make_tuple(make_merge_transform(make_tuple(N0, M0))), + make_tuple(Sequence<1, 0>{}), + make_tuple(Sequence<0>{})); +#endif + + return c_blockid_to_m0_n0_block_cluster_adaptor; + } + + using CM0M1M2NGridDesc = decltype(MakeCM0M1M2NGridDescriptor(CMNGridDesc{})); + using CBlockClusterAdaptor = decltype(MakeCBlockClusterAdaptor(CMNGridDesc{})); + + __device__ static void Run(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + FloatAB* __restrict__ p_shared_block, + const AK0MK1GridDesc& a_k0_m_k1_grid_desc, + const BK0NK1GridDesc& b_k0_n_k1_grid_desc, + const CM0M1M2NGridDesc& c_m0_m1_m2_n_grid_desc, + const CBlockClusterAdaptor& c_block_cluster_adaptor) + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_k0_m_k1_grid_desc.GetElementSpaceSize()); + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_k0_n_k1_grid_desc.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_m0_m1_m2_n_grid_desc.GetElementSpaceSize()); + + const auto K0 = a_k0_m_k1_grid_desc.GetLength(I0); + const auto M = a_k0_m_k1_grid_desc.GetLength(I1); + const auto N = b_k0_n_k1_grid_desc.GetLength(I1); + + // divide block work by [M, N] + const auto block_work_idx = + c_block_cluster_adaptor.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + // HACK: this force m/n_block_data_idx_on_grid into SGPR + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); + + const index_t n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); + + // lds max alignment + constexpr auto max_lds_align = K1; + + // A matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto a_k0_m_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, Number{}, K1), max_lds_align); + + // B matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto b_k0_n_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( + make_tuple(Number{}, Number{}, K1), max_lds_align); + + // A matrix blockwise copy + auto a_blockwise_copy = + BlockwiseDynamicTensorSliceTransfer_v4, + ABlockTransferThreadSliceLengths_K0_M_K1, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_k0_m_k1_grid_desc), + decltype(a_k0_m_k1_block_desc), + ABlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true>( + a_k0_m_k1_grid_desc, + make_multi_index(0, m_block_data_idx_on_grid, 0), + a_k0_m_k1_block_desc, + make_multi_index(0, 0, 0)); + + // B matrix blockwise copy + auto b_blockwise_copy = + BlockwiseDynamicTensorSliceTransfer_v4, + 
BBlockTransferThreadSliceLengths_K0_N_K1, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_k0_n_k1_grid_desc), + decltype(b_k0_n_k1_block_desc), + BBlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true>( + b_k0_n_k1_grid_desc, + make_multi_index(0, n_block_data_idx_on_grid, 0), + b_k0_n_k1_block_desc, + make_multi_index(0, 0, 0)); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[KPerBlock, MPerBlock] is in LDS + // b_mtx[KPerBlock, NPerBlock] is in LDS + // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in + // register + // sanity check + + static_assert(MPerBlock % (MPerWave * MRepeat) == 0 && + NPerBlock % (NPerWave * NRepeat) == 0, + "wrong!"); + + constexpr auto a_k0_m0_m1_k1_block_desc = transform_dynamic_tensor_descriptor( + a_k0_m_k1_block_desc, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform( + make_tuple(Number{}, Number{})), + make_pass_through_transform(K1)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + + constexpr auto b_k0_n0_n1_k1_block_desc = transform_dynamic_tensor_descriptor( + b_k0_n_k1_block_desc, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform( + make_tuple(Number{}, Number{})), + make_pass_through_transform(K1)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + + const auto blockwise_gemm = + BlockwiseGemmXdlops_km_kn_m0m1m2n_v1{}; + + constexpr auto CLayout = blockwise_gemm.GetCLayout(); + + constexpr index_t BlkSize = CLayout.GetBlkSize(); + constexpr index_t NumBlks = CLayout.GetNumBlks(); + constexpr index_t NumXdlops = CLayout.GetNumXdlops(); + + static_assert(NumBlks == 1 && NumXdlops == 1, "K Reduction Mfma only"); + + constexpr auto c_mr_nr_blk_desc = make_dynamic_naive_tensor_descriptor_packed_v2( + make_tuple(Number{}, Number{})); + + StaticBuffer, + c_mr_nr_blk_desc.GetElementSpaceSize()> + c_thread_buf; + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size = + math::integer_least_multiple(a_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_space_size = + math::integer_least_multiple(b_k0_n_k1_block_desc.GetElementSpaceSize(), max_lds_align); + + FloatAB* p_a_block = p_shared_block; + FloatAB* p_b_block = p_shared_block + a_block_space_size; + + constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0); + + // hack to control index calculation when iterating over A and B matrix for threadwise copy + constexpr auto a_k0_m_k1_grid_iterator_hacks = AGridIteratorHacks{}; + constexpr auto b_k0_n_k1_grid_iterator_hacks = BGridIteratorHacks{}; + + // hack to control index calculation when move slice window for A and B matrix for + // threadwise copy + constexpr auto a_k0_m_k1_grid_move_slice_window_iterator_hack = + AGridMoveSliceWindowIteratorHacks{}; + constexpr auto b_k0_n_k1_grid_move_slice_window_iterator_hack = + BGridMoveSliceWindowIteratorHacks{}; + + auto a_block_buf = make_dynamic_buffer( + p_a_block, a_k0_m_k1_block_desc.GetElementSpaceSize()); + auto b_block_buf = make_dynamic_buffer( + 
p_b_block, b_k0_n_k1_block_desc.GetElementSpaceSize()); + + // preload data into LDS + { + a_blockwise_copy.RunRead( + a_k0_m_k1_grid_desc, a_grid_buf, a_k0_m_k1_grid_iterator_hacks); + b_blockwise_copy.RunRead( + b_k0_n_k1_grid_desc, b_grid_buf, b_k0_n_k1_grid_iterator_hacks); + + a_blockwise_copy.RunWrite(a_k0_m_k1_block_desc, a_block_buf); + b_blockwise_copy.RunWrite(b_k0_n_k1_block_desc, b_block_buf); + } + + // main body + index_t k_block_data_begin = 0; + + do + { + a_blockwise_copy.MoveSrcSliceWindow(a_k0_m_k1_grid_desc, + a_block_slice_copy_step, + a_k0_m_k1_grid_move_slice_window_iterator_hack); + b_blockwise_copy.MoveSrcSliceWindow(b_k0_n_k1_grid_desc, + b_block_slice_copy_step, + b_k0_n_k1_grid_move_slice_window_iterator_hack); + + a_blockwise_copy.RunRead( + a_k0_m_k1_grid_desc, a_grid_buf, a_k0_m_k1_grid_iterator_hacks); + + block_sync_lds(); + + b_blockwise_copy.RunRead( + b_k0_n_k1_grid_desc, b_grid_buf, b_k0_n_k1_grid_iterator_hacks); + + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + + block_sync_lds(); + + a_blockwise_copy.RunWrite(a_k0_m_k1_block_desc, a_block_buf); + b_blockwise_copy.RunWrite(b_k0_n_k1_block_desc, b_block_buf); + + k_block_data_begin += KPerBlock; + } while(k_block_data_begin < (K0 - KPerBlock)); + + // tail + { + block_sync_lds(); + + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + } + +#if 0 + // output: register to global memory + { + constexpr index_t M0 = CLayout.M1(); + constexpr index_t M1 = CLayout.N1(); + constexpr index_t M2 = CLayout.M0(); + + constexpr index_t N0 = CLayout.N1(); + constexpr index_t N1 = CLayout.N0(); + + constexpr auto c_m0_m1_m2_n_thread_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(Number{}, + Number{}, + Number<1>{}, + Number<1>{}, + Number{}, + Number<1>{}, + Number{}, + Number<1>{})); + + StaticBuffer + c_blk_buf_; + + static_for<0, MRepeat, 1>{}([&](auto mr_i) { + static_for<0, NRepeat, 1>{}([&](auto nr_i) { + constexpr auto blk_off = + c_mr_nr_blk_desc.CalculateOffset(make_tuple(mr_i, nr_i)); + + static_for<0, BlkSize, 1>{}([&](auto j) { + c_blk_buf_(Number{}) = + c_thread_buf[Number{}] + .template AsType()[Number{}]; + }); + }); + }); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + + const index_t m_thread_data_on_grid = + m_block_data_idx_on_grid + c_thread_mtx_on_block[I0]; + + const index_t n_thread_data_on_grid = + n_block_data_idx_on_grid + c_thread_mtx_on_block[I1]; + + constexpr auto c_m0_m1_m2_n_grid_tensor_iterator_hacks = CGridIteratorHacks{}; + + constexpr index_t MWaves = MPerBlock / (MPerWave * MRepeat); + constexpr index_t NWaves = NPerBlock / (NPerWave * NRepeat); + + ThreadwiseDynamicTensorSliceTransfer_v1r3< + FloatC, + FloatC, + decltype(c_m0_m1_m2_n_thread_desc), + decltype(c_m0_m1_m2_n_grid_desc), + Sequence, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + CGlobalMemoryDataOperation, + 1, + true>{ + c_m0_m1_m2_n_grid_desc, + make_multi_index(m_thread_data_on_grid / (M2 * M1 * M0 * MWaves), + n_thread_data_on_grid / (N1 * NWaves), + m_thread_data_on_grid % (M2 * M1 * M0 * MWaves) / (M2 * M1 * M0), + n_thread_data_on_grid % (N1 * NWaves) / N1, + m_thread_data_on_grid % (M2 * M1 * M0) / (M2 * M1), + m_thread_data_on_grid % (M2 * M1) / M2, + m_thread_data_on_grid % M2, + n_thread_data_on_grid % N1)} + 
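// Illustrative note on the main loop above: unlike the dlops kernels, this xdlops
// variant keeps a single LDS copy of each input tile, so correctness comes from the
// two block_sync_lds calls per iteration rather than from double buffering. The
// next tile is first staged in registers by RunRead; the first barrier makes the
// previous RunWrite visible to every wave before the MFMA GEMM reads LDS, and the
// second barrier keeps RunWrite from overwriting LDS until every wave has finished
// that GEMM. Global loads still overlap with math because RunRead only touches
// registers, not LDS.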
.Run(c_m0_m1_m2_n_thread_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0), + c_blk_buf_, + c_m0_m1_m2_n_grid_desc, + c_grid_buf, + c_m0_m1_m2_n_grid_tensor_iterator_hacks); + } +#else + { + constexpr index_t M0 = CLayout.M1(); + constexpr index_t M1 = CLayout.N1(); + constexpr index_t M2 = CLayout.M0(); + + constexpr auto c_m0_m1_m2_n_thread_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple( + I1, I1, I1, I1, Number{}, Number<1>{}, Number{}, Number<1>{})); + + StaticBuffer c_blk_buf_; + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + + const index_t m_thread_data_on_grid = + m_block_data_idx_on_grid + c_thread_mtx_on_block[I0]; + + const index_t n_thread_data_on_grid = + n_block_data_idx_on_grid + c_thread_mtx_on_block[I1]; + + constexpr auto c_m0_m1_m2_n_grid_tensor_iterator_hacks = CGridIteratorHacks{}; + + auto c_thread_copy = + ThreadwiseDynamicTensorSliceTransfer_v1r3, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + CGlobalMemoryDataOperation, + 1, + true>{ + c_m0_m1_m2_n_grid_desc, + make_multi_index(0, + 0, + 0, + 0, + m_thread_data_on_grid / (M2 * M1), + m_thread_data_on_grid % (M2 * M1) / M2, + m_thread_data_on_grid % M2, + n_thread_data_on_grid)}; + + auto init_copy = [&](auto c_thread_idx_) { + constexpr auto blk_off = c_mr_nr_blk_desc.CalculateOffset(c_thread_idx_); + c_thread_copy.Run(c_m0_m1_m2_n_thread_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0), + c_thread_buf[Number{}].template AsType(), + c_m0_m1_m2_n_grid_desc, + c_grid_buf, + c_m0_m1_m2_n_grid_tensor_iterator_hacks); + + return c_thread_idx_; + }; + + auto mrepeat_plus_copy = [&](auto c_thread_idx_) { + constexpr auto mrepeat_step_plus = make_multi_index(1, 0, 0, 0, 0, 0, 0, 0); + c_thread_copy.MoveDstSliceWindow(c_m0_m1_m2_n_grid_desc, mrepeat_step_plus); + + constexpr auto blk_off = c_mr_nr_blk_desc.CalculateOffset(c_thread_idx_); + c_thread_copy.Run(c_m0_m1_m2_n_thread_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0), + c_thread_buf[Number{}].template AsType(), + c_m0_m1_m2_n_grid_desc, + c_grid_buf, + c_m0_m1_m2_n_grid_tensor_iterator_hacks); + }; + + auto nrepeat_plus_copy = [&](auto c_thread_idx_) { + constexpr auto nrepeat_step_plus = make_multi_index(0, 1, 0, 0, 0, 0, 0, 0); + c_thread_copy.MoveDstSliceWindow(c_m0_m1_m2_n_grid_desc, nrepeat_step_plus); + + constexpr auto blk_off = c_mr_nr_blk_desc.CalculateOffset(c_thread_idx_); + c_thread_copy.Run(c_m0_m1_m2_n_thread_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0), + c_thread_buf[Number{}].template AsType(), + c_m0_m1_m2_n_grid_desc, + c_grid_buf, + c_m0_m1_m2_n_grid_tensor_iterator_hacks); + }; + + auto mrepeat_minus_copy = [&](auto c_thread_idx_) { + constexpr auto mrepeat_step_plus = make_multi_index(-1, 0, 0, 0, 0, 0, 0, 0); + c_thread_copy.MoveDstSliceWindow(c_m0_m1_m2_n_grid_desc, mrepeat_step_plus); + + constexpr auto blk_off = c_mr_nr_blk_desc.CalculateOffset(c_thread_idx_); + c_thread_copy.Run(c_m0_m1_m2_n_thread_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0), + c_thread_buf[Number{}].template AsType(), + c_m0_m1_m2_n_grid_desc, + c_grid_buf, + c_m0_m1_m2_n_grid_tensor_iterator_hacks); + }; + + auto nrepeat_minus_copy = [&](auto c_thread_idx_) { + constexpr auto nrepeat_step_minus = make_multi_index(0, -1, 0, 0, 0, 0, 0, 0); + c_thread_copy.MoveDstSliceWindow(c_m0_m1_m2_n_grid_desc, 
nrepeat_step_minus); + + constexpr auto blk_off = c_mr_nr_blk_desc.CalculateOffset(c_thread_idx_); + c_thread_copy.Run(c_m0_m1_m2_n_thread_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0), + c_thread_buf[Number{}].template AsType(), + c_m0_m1_m2_n_grid_desc, + c_grid_buf, + c_m0_m1_m2_n_grid_tensor_iterator_hacks); + }; + + static_assert((MRepeat == 4 && NRepeat == 4) or (MRepeat == 4 && NRepeat == 2) or + (MRepeat == 2 && NRepeat == 4) or (MRepeat == 2 && NRepeat == 2) or + (MRepeat == 2 && NRepeat == 1) or (MRepeat == 1 && NRepeat == 2) or + (MRepeat == 1 && NRepeat == 1), + "wrong"); + + if constexpr(MRepeat == 4 && NRepeat == 4) + { + init_copy(make_tuple(I0, I0)); + + if constexpr(CAccessOrderMRepeatNRepeat) + { + nrepeat_plus_copy(make_tuple(I0, I1)); + nrepeat_plus_copy(make_tuple(I0, I2)); + nrepeat_plus_copy(make_tuple(I0, I3)); + mrepeat_plus_copy(make_tuple(I1, I3)); + nrepeat_minus_copy(make_tuple(I1, I2)); + nrepeat_minus_copy(make_tuple(I1, I1)); + nrepeat_minus_copy(make_tuple(I1, I0)); + mrepeat_plus_copy(make_tuple(I2, I0)); + nrepeat_plus_copy(make_tuple(I2, I1)); + nrepeat_plus_copy(make_tuple(I2, I2)); + nrepeat_plus_copy(make_tuple(I2, I3)); + mrepeat_plus_copy(make_tuple(I3, I3)); + nrepeat_minus_copy(make_tuple(I3, I2)); + nrepeat_minus_copy(make_tuple(I3, I1)); + nrepeat_minus_copy(make_tuple(I3, I0)); + } + else + { + mrepeat_plus_copy(make_tuple(I1, I0)); + mrepeat_plus_copy(make_tuple(I2, I0)); + mrepeat_plus_copy(make_tuple(I3, I0)); + nrepeat_plus_copy(make_tuple(I3, I1)); + mrepeat_minus_copy(make_tuple(I2, I1)); + mrepeat_minus_copy(make_tuple(I1, I1)); + mrepeat_minus_copy(make_tuple(I0, I1)); + nrepeat_plus_copy(make_tuple(I0, I2)); + mrepeat_plus_copy(make_tuple(I1, I2)); + mrepeat_plus_copy(make_tuple(I2, I2)); + mrepeat_plus_copy(make_tuple(I3, I2)); + nrepeat_plus_copy(make_tuple(I3, I3)); + mrepeat_minus_copy(make_tuple(I2, I3)); + mrepeat_minus_copy(make_tuple(I1, I3)); + mrepeat_minus_copy(make_tuple(I0, I3)); + } + } + else if constexpr(MRepeat == 4 && NRepeat == 2) + { + init_copy(make_tuple(I0, I0)); + + if constexpr(CAccessOrderMRepeatNRepeat) + { + nrepeat_plus_copy(make_tuple(I0, I1)); + mrepeat_plus_copy(make_tuple(I1, I1)); + nrepeat_minus_copy(make_tuple(I1, I0)); + mrepeat_plus_copy(make_tuple(I2, I0)); + nrepeat_plus_copy(make_tuple(I2, I1)); + mrepeat_plus_copy(make_tuple(I3, I1)); + nrepeat_minus_copy(make_tuple(I3, I0)); + } + else + { + mrepeat_plus_copy(make_tuple(I1, I0)); + mrepeat_plus_copy(make_tuple(I2, I0)); + mrepeat_plus_copy(make_tuple(I3, I0)); + nrepeat_plus_copy(make_tuple(I3, I1)); + mrepeat_minus_copy(make_tuple(I2, I1)); + mrepeat_minus_copy(make_tuple(I1, I1)); + mrepeat_minus_copy(make_tuple(I0, I1)); + } + } + else if constexpr(MRepeat == 2 && NRepeat == 4) + { + init_copy(make_tuple(I0, I0)); + + if constexpr(CAccessOrderMRepeatNRepeat) + { + nrepeat_plus_copy(make_tuple(I0, I1)); + nrepeat_plus_copy(make_tuple(I0, I2)); + nrepeat_plus_copy(make_tuple(I0, I3)); + mrepeat_plus_copy(make_tuple(I1, I3)); + nrepeat_minus_copy(make_tuple(I1, I2)); + nrepeat_minus_copy(make_tuple(I1, I1)); + nrepeat_minus_copy(make_tuple(I1, I0)); + } + else + { + mrepeat_plus_copy(make_tuple(I1, I0)); + nrepeat_plus_copy(make_tuple(I1, I1)); + mrepeat_minus_copy(make_tuple(I0, I1)); + nrepeat_plus_copy(make_tuple(I0, I2)); + mrepeat_plus_copy(make_tuple(I1, I2)); + nrepeat_plus_copy(make_tuple(I1, I3)); + mrepeat_minus_copy(make_tuple(I0, I3)); + } + } + else if constexpr(MRepeat == 2 && NRepeat == 2) + { + init_copy(make_tuple(I0, 
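// Illustrative note: the dispatch above walks the (MRepeat, NRepeat) sub-tiles in a
// snake-like order, so consecutive writes differ by +/-1 in exactly one repeat
// coordinate and each step is a single MoveDstSliceWindow instead of a full
// destination-coordinate recomputation. A generator for the MRepeat-major variant
// (the other branch snakes over MRepeat within each NRepeat column):

#include <cstdio>

void print_snake_order(int MRepeat, int NRepeat)
{
    for(int m = 0; m < MRepeat; ++m)
    {
        for(int j = 0; j < NRepeat; ++j)
        {
            const int n = (m % 2 == 0) ? j : NRepeat - 1 - j; // reverse direction on odd rows
            std::printf("(%d, %d)\n", m, n);
        }
    }
}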
I0)); + + if constexpr(CAccessOrderMRepeatNRepeat) + { + nrepeat_plus_copy(make_tuple(I0, I1)); + mrepeat_plus_copy(make_tuple(I1, I1)); + nrepeat_minus_copy(make_tuple(I1, I0)); + } + else + { + mrepeat_plus_copy(make_tuple(I1, I0)); + nrepeat_plus_copy(make_tuple(I1, I1)); + mrepeat_minus_copy(make_tuple(I0, I1)); + } + } + else if constexpr(MRepeat == 2 && NRepeat == 1) + { + init_copy(make_tuple(I0, I0)); + mrepeat_plus_copy(make_tuple(I1, I0)); + } + else if constexpr(MRepeat == 1 && NRepeat == 2) + { + init_copy(make_tuple(I0, I0)); + nrepeat_plus_copy(make_tuple(I0, I1)); + } + else if constexpr(MRepeat == 1 && NRepeat == 1) + { + init_copy(make_tuple(I0, I0)); + } + } +#endif + } +}; // namespace ck + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/threadwise_contraction_dlops.hpp b/composable_kernel/include/tensor_operation/threadwise_contraction_dlops.hpp new file mode 100644 index 0000000000..7e7bb9c8c3 --- /dev/null +++ b/composable_kernel/include/tensor_operation/threadwise_contraction_dlops.hpp @@ -0,0 +1,230 @@ +#ifndef CK_THREADWISE_CONTRACTION_DLOPS_HPP +#define CK_THREADWISE_CONTRACTION_DLOPS_HPP + +#include "common_header.hpp" +#include "math.hpp" + +namespace ck { + +// C[TM0, TM1, TN0, TN1] += A[TK, TM0, TM1] * B[TK, TN0, TN1] +// Tensor element can be vectorized data +// Assume: +// 1. AThreadDesc_TK0_TM0_TM1_TK1, BThreadDesc_TK0_TN0_TN1_TK1, CThreadDesc_TM0_TM1_TN0_TN1 are +// known at compile-time +// 2. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time +template ::type = false> +struct ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1 +{ + __device__ constexpr ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1() + { + static_assert(AThreadDesc_TK0_TM0_TM1_TK1::IsKnownAtCompileTime() && + BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() && + CThreadDesc_TM0_TM1_TN0_TN1::IsKnownAtCompileTime(), + "wrong! Desc should be known at compile-time"); + + // TODO: sanity-check: compare AThreadDesc_TK0_TM0_TM1_TK1, BThreadDesc_TK0_TN0_TN1_TK1, + // CThreadDesc_TM0_TM1_TN0_TN1 Size with KLenghts, TMLengths and TNLengths + + // TODO remove this restriction + static_assert(TKLengths::Size() == 1 && TMLengths::Size() == 2 && TNLengths::Size() == 2, + "wrong!"); + } + + template + __device__ static void Run(const ABuffer& a_buf, + AOriginIdx, + const BBuffer& b_buf, + BOriginIdx, + CBuffer& c_buf, + COriginIdx) + { + static_assert( + is_known_at_compile_time>>::value && + is_known_at_compile_time>>::value && + is_known_at_compile_time>>::value, + "wrong! AOriginIdx, BOriginIdx, COringinIdx should be known at compile-time"); + + static_assert(is_same>, + remove_cv_t>>::value && + is_same>, + remove_cv_t>>::value && + is_same>, + remove_cv_t>>::value && + "wrong! 
inconsistent type"); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + constexpr auto TK = TKLengths{}[I0]; + constexpr auto TM0 = TMLengths{}[I0]; + constexpr auto TM1 = TMLengths{}[I1]; + constexpr auto TN0 = TNLengths{}[I0]; + constexpr auto TN1 = TNLengths{}[I1]; + + constexpr auto a_origin_idx = to_multi_index(AOriginIdx{}); + constexpr auto b_origin_idx = to_multi_index(BOriginIdx{}); + constexpr auto c_origin_idx = to_multi_index(COriginIdx{}); + + static_for<0, TK, 1>{}([&](auto tk) { + static_for<0, TM0, 1>{}([&](auto tm0) { + static_for<0, TM1, 1>{}([&](auto tm1) { + static_for<0, TN0, 1>{}([&](auto tn0) { + static_for<0, TN1, 1>{}([&](auto tn1) { + constexpr index_t a_offset = + AThreadDesc_TK0_TM0_TM1_TK1{}.CalculateOffset( + a_origin_idx + make_multi_index(tk, tm0, tm1)); + constexpr index_t b_offset = + BThreadDesc_TK0_TN0_TN1_TK1{}.CalculateOffset( + b_origin_idx + make_multi_index(tk, tn0, tn1)); + constexpr index_t c_offset = + CThreadDesc_TM0_TM1_TN0_TN1{}.CalculateOffset( + c_origin_idx + make_multi_index(tm0, tm1, tn0, tn1)); + + amd_inner_product_dlop( + a_buf[Number{}], + b_buf[Number{}], + c_buf(Number{})); + }); + }); + }); + }); + }); + } +}; + +// C[TM0, TM1, TN0, TN1] += A[TK0, TM0, TM1, TK1] * B[TK0, TN0, TN1, TK1] +// Tensor element can be vectorized data +// Assume: +// 1. AThreadDesc_TK0_TM0_TM1_TK1, BThreadDesc_TK0_TN0_TN1_TK1, CThreadDesc_TM0_TM1_TN0_TN1 are +// known at compile-time +// 2. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time +template ::type = false> +struct ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1 +{ + __device__ constexpr ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1() + { + static_assert(AThreadDesc_TK0_TM0_TM1_TK1::IsKnownAtCompileTime() && + BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() && + CThreadDesc_TM0_TM1_TN0_TN1::IsKnownAtCompileTime(), + "wrong! Desc should be known at compile-time"); + + // TODO: sanity-check: compare AThreadDesc_TK0_TM0_TM1_TK1, BThreadDesc_TK0_TN0_TN1_TK1, + // CThreadDesc_TM0_TM1_TN0_TN1 Size with KLenghts, TMLengths and TNLengths + + // TODO remove this restriction + static_assert(TKLengths::Size() == 2 && TMLengths::Size() == 2 && TNLengths::Size() == 2, + "wrong!"); + } + + template + __device__ static void Run(const ABuffer& a_buf, + AOriginIdx, + const BBuffer& b_buf, + BOriginIdx, + CBuffer& c_buf, + COriginIdx) + { + static_assert( + is_known_at_compile_time>>::value && + is_known_at_compile_time>>::value && + is_known_at_compile_time>>::value, + "wrong! AOriginIdx, BOriginIdx, COringinIdx should be known at compile-time"); + + static_assert(is_same>, + remove_cv_t>>::value && + is_same>, + remove_cv_t>>::value && + is_same>, + remove_cv_t>>::value && + "wrong! 
inconsistent type"); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + constexpr index_t TK0 = TKLengths{}[I0]; + constexpr index_t TK1 = TKLengths{}[I1]; + constexpr index_t TM0 = TMLengths{}[I0]; + constexpr index_t TM1 = TMLengths{}[I1]; + constexpr index_t TN0 = TNLengths{}[I0]; + constexpr index_t TN1 = TNLengths{}[I1]; + + constexpr auto a_origin_idx = to_multi_index(AOriginIdx{}); + constexpr auto b_origin_idx = to_multi_index(BOriginIdx{}); + constexpr auto c_origin_idx = to_multi_index(COriginIdx{}); + + static_for<0, TK0, 1>{}([&](auto tk0) { + static_for<0, TM0, 1>{}([&](auto tm0) { + static_for<0, TM1, 1>{}([&](auto tm1) { + static_for<0, TN0, 1>{}([&](auto tn0) { + static_for<0, TN1, 1>{}([&](auto tn1) { + vector_type a_vec; + vector_type b_vec; + + static_for<0, TK1, 1>{}([&](auto tk1) { + constexpr index_t a_offset = + AThreadDesc_TK0_TM0_TM1_TK1{}.CalculateOffset( + a_origin_idx + make_multi_index(tk0, tm0, tm1, tk1)); + + constexpr index_t b_offset = + BThreadDesc_TK0_TN0_TN1_TK1{}.CalculateOffset( + b_origin_idx + make_multi_index(tk0, tn0, tn1, tk1)); + + a_vec.template AsType()(tk1) = a_buf[Number{}]; + b_vec.template AsType()(tk1) = b_buf[Number{}]; + }); + + using a_vector_t = typename vector_type::type; + using b_vector_t = typename vector_type::type; + + constexpr index_t c_offset = + CThreadDesc_TM0_TM1_TN0_TN1{}.CalculateOffset( + c_origin_idx + make_multi_index(tm0, tm1, tn0, tn1)); + + amd_inner_product_dlop( + a_vec.template AsType()[I0], + b_vec.template AsType()[I0], + c_buf(Number{})); + }); + }); + }); + }); + }); + } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_set.hpp b/composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_set.hpp new file mode 100644 index 0000000000..f1b632aa84 --- /dev/null +++ b/composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_set.hpp @@ -0,0 +1,59 @@ +#ifndef CK_THREADWISE_DYNAMIC_TENSOR_SET_HPP +#define CK_THREADWISE_DYNAMIC_TENSOR_SET_HPP + +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" + +namespace ck { + +// Assume: +// 1. Desc is known at compile-time +// 2. Buffer is StaticBuffer +// 3. OriginIdx is known at compile-time +// 4. use #-iterator +template ::type = false> +struct ThreadwiseDynamicTensorSliceSet_v1 +{ + static constexpr index_t nDim = SliceLengths::Size(); + + using Index = MultiIndex; + + template + __device__ void Run(const Desc&, const OriginIdx&, Buffer& buf, const Data& initial_value) const + { + static_assert(Desc::IsKnownAtCompileTime(), + "wrong! SrcDesc and DstDesc need to known at compile-time"); + + static_assert(Buffer::IsStaticBuffer(), "wrong! DstBuffer need to be StaticBuffer"); + + static_assert(is_known_at_compile_time>>::value, + "wrong! 
OriginIdx need to be known at compile-time"); + + // Desc is known at compile-time + constexpr auto desc = remove_cv_t>{}; + + // OriginIdx is known at compile-time + constexpr auto origin_idx = to_multi_index(OriginIdx{}); + + static_ford{}([&](auto access_idx) { + constexpr auto coord = make_dynamic_tensor_coordinate(desc, origin_idx + access_idx); + + constexpr bool is_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(desc, coord); + + constexpr index_t offset = coord.GetOffset(); + + if constexpr(is_valid) + { + buf(Number{}) = initial_value; + } + }); + } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp b/composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp new file mode 100644 index 0000000000..9626113686 --- /dev/null +++ b/composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp @@ -0,0 +1,1449 @@ +#ifndef CK_THREADWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_HPP +#define CK_THREADWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_HPP + +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" + +namespace ck { + +// Do following things to avoid "alloca" in LLVM-IR, which would cause scratch memory +// and sometimes useless instructions: +// 1. Don't save a reference to tensor descriptor in class, pass in tensor descriptor as argument +// instead +// 2. Don't construct a new tensor coordinate everytime when using it, update and reuse the same +// tensor coordinate instead +// 3. Don't use a pointer to VGPR buffer, use vector instead + +namespace detail { +// TODO: How to fix this? It uses an struct instead of lambda because lambda +// doesn't have constructor +template +struct lambda_scalar_per_access +{ + __host__ __device__ constexpr auto operator()(index_t i) const + { + return (i == VectorDim) ? ScalarPerVector : 1; + } +}; + +template +struct lambda_scalar_step_in_vector +{ + __host__ __device__ constexpr auto operator()(index_t i) const + { + return (i == VectorDim) ? 1 : 0; + } +}; +} // namespace detail + +// Assume: +// 1. src: +// 1. SrcDesc is known at compile-time +// 2. SrcBuffer is StaticBuffer +// 3. SrcSliceOrginIdx is known at compile-time +// 2. dst: +// 1. DstDesc is not known at compile-time +// 2. DstBuffer is DynamicBuffer +// 3. DstSliceOrginIdx is not known at compile time +template ::type = false> +struct ThreadwiseDynamicTensorSliceTransfer_v1r3 +{ + static constexpr index_t nDim = SliceLengths::Size(); + + using Index = MultiIndex; + + using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{})); + + using DstCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(DstDesc{}, Index{})); + + __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r3( + const DstDesc& dst_desc, const Index& dst_slice_origin_idx) + : dst_coord_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx)) + { + static_assert(SrcDesc::IsKnownAtCompileTime(), + "wrong! 
SrcDesc need to known at compile-time"); + } + + __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) + { + dst_coord_ = make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx); + } + + template + __device__ void Run(const SrcDesc&, + const SrcSliceOriginIdx&, + const SrcBuffer& src_buf, + const DstDesc& dst_desc, + DstBuffer& dst_buf, + const DstIteratorHacks& dst_iterator_hacks) + { + static_assert(SrcDesc::IsKnownAtCompileTime(), + "wrong! SrcDesc need to known at compile-time"); + + static_assert( + is_known_at_compile_time>>::value, + "wrong! SrcSliceOrigin need to known at compile-time"); + + static_assert(SrcBuffer::IsStaticBuffer(), "wrong! SrcBuffer need to be StaticBuffer"); + + // static_assert(is_same>, + // remove_cv_t>>::value, + //"wrong! SrcBuffer data type is wrong"); + + // SrcDesc and src_slice_origin_idx are known at compile-time + constexpr auto src_desc = remove_cv_t>{}; + constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{}); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_scalar_step_in_vector = + generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access; + + constexpr auto dim_access_order = DimAccessOrder{}; + + constexpr auto ordered_access_lengths = + container_reorder_given_new2old(access_lengths, dim_access_order); + + // make forward iterators + const auto dst_forward_iterators = generate_tuple( + [&](auto i) { + Index forward_step; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0; + }); + + return make_dynamic_tensor_coordinate_iterator( + dst_desc, forward_step, dst_iterator_hacks[I0][i]); + }, + Number{}); + + // make backward iterators + const auto dst_backward_iterators = generate_tuple( + [&](auto i) { + Index backward_step; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0; + }); + + return make_dynamic_tensor_coordinate_iterator( + dst_desc, backward_step, dst_iterator_hacks[I1][i]); + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_access_idx[I0]; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] + ? 
ordered_access_idx[i] + : ordered_access_lengths[i] - 1 - ordered_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, dim_access_order) * + dst_scalar_per_access; + }(); + + typename vector_type_maker::type dst_vector; + + using dst_vector_t = + typename vector_type_maker::type::type; + + // copy data from src_buf into dst_vector + static_for<0, DstScalarPerVector, 1>{}([&](auto i) { + constexpr index_t src_offset = src_desc.CalculateOffset( + src_slice_origin_idx + dst_data_idx + i * dst_scalar_step_in_vector); + + dst_vector.template AsType()(i) = + type_convert{}(src_buf[Number{}]); + }); + + const bool is_dst_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); + + // copy data from dst_vector into dst_buf + dst_buf.template Set( + dst_coord_.GetOffset(), + is_dst_valid, + dst_vector.template AsType()[Number<0>{}]); + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_dynamic_tensor_coordinate( + dst_desc, dst_coord_, dst_forward_iterators[dim_access_order[i]]); + } + else + { + move_dynamic_tensor_coordinate( + dst_desc, dst_coord_, dst_backward_iterators[dim_access_order[i]]); + } + } + }); + }); + + // move dst coordinate back to slice origin (or not) + if constexpr(DstResetCoordinateAfterRun) + { + const auto dst_reset_iterator = + make_dynamic_tensor_coordinate_iterator(dst_desc, GetDstCoordinateResetStep()); + + move_dynamic_tensor_coordinate(dst_desc, dst_coord_, dst_reset_iterator); + } + } + + template + __device__ void Run(const SrcDesc&, + const SrcSliceOriginIdx&, + const SrcBuffer& src_buf, + const DstDesc& dst_desc, + DstBuffer& dst_buf) + { + constexpr index_t ntransform_dst = DstDesc::GetNumOfTransform(); + + constexpr auto zeros = typename uniform_sequence_gen::type{}; + + constexpr auto dst_iterator_hacks = + make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), + generate_tuple([&](auto) { return zeros; }, Number{})); + + Run(SrcDesc{}, SrcSliceOriginIdx{}, src_buf, dst_desc, dst_buf, dst_iterator_hacks); + } + + __device__ static constexpr auto GetDstCoordinateResetStep() + { + constexpr auto I0 = Number<0>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access; + + constexpr auto dim_access_order = DimAccessOrder{}; + + constexpr auto ordered_access_lengths = + container_reorder_given_new2old(access_lengths, dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_access_lengths[I0] - 1; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index after last iteration in 
Run(), if it has not being reset by + // RunWrite() + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, dim_access_order) * + dst_scalar_per_access; + }(); + + // + constexpr auto reset_dst_data_step = [&]() { + Index reset_dst_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; }); + + return reset_dst_data_step_; + }(); + + return reset_dst_data_step; + } + + // dst_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, + const Index& dst_slice_origin_step_idx) + { + // if dst coord was not reset by Run(), then need to adjust the step here + const auto adjusted_step_idx = + DstResetCoordinateAfterRun ? dst_slice_origin_step_idx + : dst_slice_origin_step_idx + GetDstCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = + make_dynamic_tensor_coordinate_iterator(dst_desc, adjusted_step_idx); + + move_dynamic_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); + } + + private: + DstCoord dst_coord_; +}; // namespace ck + +// Assume: +// 1. src: +// 1. SrcDesc is not known at compile-time +// 2. SrcBuffer is DynamicBuffer +// 3. src_slice_origin_idx is not known at compile-time +// 2. dst: +// 1. DstDesc is known at compile-time +// 2. DstBuffer is StaticBuffer +// 3. dst_slice_origin_idx is known at compile-time +template ::type = false> +struct ThreadwiseDynamicTensorSliceTransfer_v2 +{ + static constexpr index_t nDim = SliceLengths::Size(); + + using Index = MultiIndex; + + using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{})); + + using SrcCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(SrcDesc{}, Index{})); + + __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v2(const SrcDesc& src_desc, + const Index& src_slice_origin_idx) + : src_coord_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx)) + { + static_assert(DstDesc::IsKnownAtCompileTime(), + "wrong! SrcDesc need to known at compile-time"); + } + + __device__ void SetDstSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) + { + src_coord_ = make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx); + } + + template + __device__ void Run(const SrcDesc& src_desc, + const SrcBuffer& src_buf, + const DstDesc&, + const DstSliceOriginIdx&, + DstBuffer& dst_buf, + const SrcIteratorHacks& src_iterator_hacks) + { + static_assert(DstDesc::IsKnownAtCompileTime(), + "wrong! DstDesc need to known at compile-time"); + + static_assert( + is_known_at_compile_time>>::value, + "wrong! DstSliceOrigin need to known at compile-time"); + + static_assert(is_same>, + remove_cv_t>>::value && + "wrong! 
inconsistent type"); + + // DstDesc and dst_slice_origin_idx are known at compile-time + constexpr auto dst_desc = remove_cv_t>{}; + constexpr auto dst_slice_origin_idx = DstSliceOriginIdx{}; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto src_scalar_step_in_vector = + generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto dim_access_order = DimAccessOrder{}; + + constexpr auto ordered_access_lengths = + container_reorder_given_new2old(access_lengths, dim_access_order); + + // make forward iterators + const auto src_forward_iterators = generate_tuple( + [&](auto i) { + Index forward_step; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0; + }); + + return make_dynamic_tensor_coordinate_iterator( + src_desc, forward_step, src_iterator_hacks[I0][i]); + }, + Number{}); + + // make backward iterators + const auto src_backward_iterators = generate_tuple( + [&](auto i) { + Index backward_step; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0; + }); + + return make_dynamic_tensor_coordinate_iterator( + src_desc, backward_step, src_iterator_hacks[I1][i]); + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_access_idx[I0]; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] + ? 
ordered_access_idx[i] + : ordered_access_lengths[i] - 1 - ordered_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, dim_access_order) * + src_scalar_per_access; + }(); + + typename vector_type_maker::type src_vector; + + using src_vector_t = + typename vector_type_maker::type::type; + + const bool is_src_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_); + + // copy data from src_buf into src_vector + src_vector.template AsType()(Number<0>{}) = + src_buf.template Get(src_coord_.GetOffset(), is_src_valid); + + // copy data from src_vector into dst_buf + static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { + constexpr index_t dst_offset = + dst_desc.CalculateOffset(to_multi_index(dst_slice_origin_idx) + src_data_idx + + i * src_scalar_step_in_vector); + + dst_buf(Number{}) = src_vector.template AsType()[i]; + }); + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_dynamic_tensor_coordinate( + src_desc, src_coord_, src_forward_iterators[dim_access_order[i]]); + } + else + { + move_dynamic_tensor_coordinate( + src_desc, src_coord_, src_backward_iterators[dim_access_order[i]]); + } + } + }); + }); + + // move src coordinate back to slice origin (or not) + if constexpr(SrcResetCoordinateAfterRun) + { + const auto src_reset_iterator = + make_dynamic_tensor_coordinate_iterator(src_desc, GetSrcCoordinateResetStep()); + + move_dynamic_tensor_coordinate(src_desc, src_coord_, src_reset_iterator); + } + } + + template + __device__ void Run(const SrcDesc& src_desc, + const SrcBuffer& src_buf, + const DstDesc&, + const DstSliceOriginIdx&, + DstBuffer& dst_buf) + { + constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform(); + + constexpr auto zeros = typename uniform_sequence_gen::type{}; + + constexpr auto src_iterator_hacks = + make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), + generate_tuple([&](auto) { return zeros; }, Number{})); + + Run(src_desc, src_buf, DstDesc{}, DstSliceOriginIdx{}, dst_buf, src_iterator_hacks); + } + + __device__ static constexpr auto GetSrcCoordinateResetStep() + { + constexpr auto I0 = Number<0>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto dim_access_order = DimAccessOrder{}; + + constexpr auto ordered_access_lengths = + container_reorder_given_new2old(access_lengths, dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_access_lengths[I0] - 1; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index after last iteration in 
Run(), if it has not being reset by + // RunWrite() + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, dim_access_order) * + src_scalar_per_access; + }(); + + // + constexpr auto reset_src_data_step = [&]() { + Index reset_src_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; }); + + return reset_src_data_step_; + }(); + + return reset_src_data_step; + } + + // dst_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx) + { + // if src coord was not reset by Run(), then need to adjust the step here + const auto adjusted_step_idx = + SrcResetCoordinateAfterRun ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = + make_dynamic_tensor_coordinate_iterator(src_desc, adjusted_step_idx); + + move_dynamic_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + + private: + SrcCoord src_coord_; +}; // namespace ck + +// Assume: +// 1. src_desc and dst_desc are not known at compile-time +// 2. SrcBuffer and DstBuffer are DynamicBuffer +// 3. src_slice_origin and dst_slice_origin are not known at compile-time, +// 4. Use thread buffer +template // control whether to move back dst coordinate after each + // RunWrite(), will be fused with MoveDstSliceWindow to + // save addr computation +struct ThreadwiseDynamicTensorSliceTransfer_v3 +{ + static constexpr index_t nDim = SliceLengths::Size(); + using Index = MultiIndex; + + using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{})); + using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{})); + + using SrcCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(SrcDesc{}, Index{})); + using DstCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(DstDesc{}, Index{})); + + __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v3(const SrcDesc& src_desc, + const Index& src_slice_origin, + const DstDesc& dst_desc, + const Index& dst_slice_origin) + : src_coord_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin)), + dst_coord_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin)) + { + // TODO: fix this + static_assert(is_same::value, + "wrong! current implementation assume SrcData and DstData are same type"); + } + + __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) + { + src_coord_ = make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx); + } + + __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) + { + dst_coord_ = make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx); + } + + template + __device__ void RunRead(const SrcDesc& src_desc, + const SrcBuffer& src_buf, + const SrcIteratorHacks& src_iterator_hacks) + { + static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or + SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, + "wrong!"); + + static_assert(is_same>, + remove_cv_t>>::value, + "wrong! 
SrcBuffer and SrcData data type are inconsistent"); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto src_scalar_step_in_vector = + generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; + + constexpr auto ordered_src_access_lengths = + container_reorder_given_new2old(src_access_lengths, src_dim_access_order); + + // make forward iterators + const auto src_forward_iterators = generate_tuple( + [&](auto i) { + Index forward_step; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0; + }); + + return make_dynamic_tensor_coordinate_iterator( + src_desc, forward_step, src_iterator_hacks[I0][i]); + }, + Number{}); + + // make backward iterators + const auto src_backward_iterators = generate_tuple( + [&](auto i) { + Index backward_step; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0; + }); + + return make_dynamic_tensor_coordinate_iterator( + src_desc, backward_step, src_iterator_hacks[I1][i]); + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_src_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_src_access_idx[I0]; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_src_access_idx[i] + : ordered_src_access_lengths[i] - 1 - + ordered_src_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * + src_scalar_per_access; + }(); + + vector_type_maker_t src_tmp_vector; + + using src_vector_t = typename decltype(src_tmp_vector)::type; + + const bool is_src_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_); + + // copy data from src_buf to src_tmp_vector + src_tmp_vector.template AsType()(Number<0>{}) = + src_buf.template Get(src_coord_.GetOffset(), is_src_valid); + + // copy data from src_tmp_vector to buffer_ + static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { + constexpr index_t buffer_offset = + buffer_desc_.CalculateOffset(src_data_idx + i * src_scalar_step_in_vector); + + buffer_(Number{}) = src_tmp_vector.template AsType()[i]; + }); + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_dynamic_tensor_coordinate( + src_desc, src_coord_, src_forward_iterators[src_dim_access_order[i]]); + } + else + { + move_dynamic_tensor_coordinate( + src_desc, src_coord_, src_backward_iterators[src_dim_access_order[i]]); + } + } + }); + }); + + // move src coordinate back to slice origin (or not) + if constexpr(SrcResetCoordinateAfterRun) + { + const auto src_reset_iterator = + make_dynamic_tensor_coordinate_iterator(src_desc, GetSrcCoordinateResetStep()); + + move_dynamic_tensor_coordinate(src_desc, src_coord_, src_reset_iterator); + } + } + + template + __device__ void RunWrite(const DstDesc& dst_desc, + DstBuffer& dst_buf, + const DstIteratorHacks& dst_iterator_hacks) + { + static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or + DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, + "wrong!"); + + static_assert(is_same>, + remove_cv_t>>::value, + "wrong! SrcBuffer or DstBuffer data type is wrong"); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + // src scalar per access on each dim + // TODO: don't use this + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_scalar_step_in_vector = + generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; + + constexpr auto dst_dim_access_order = DstDimAccessOrder{}; + + constexpr auto ordered_dst_access_lengths = + container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); + + // make forward iterators + const auto dst_forward_iterators = generate_tuple( + [&](auto i) { + Index forward_step; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step(j) = (i.value == j.value) ? 
dst_scalar_per_access[i] : 0; + }); + + const auto forward_iterator = make_dynamic_tensor_coordinate_iterator( + dst_desc, forward_step, dst_iterator_hacks[I0][i]); + + return forward_iterator; + }, + Number{}); + + // make backward iterators + const auto dst_backward_iterators = generate_tuple( + [&](auto i) { + Index backward_step; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0; + }); + + const auto backward_iterator = make_dynamic_tensor_coordinate_iterator( + dst_desc, backward_step, dst_iterator_hacks[I1][i]); + + return backward_iterator; + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_dst_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_dst_access_idx[I0]; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_idx[i] + : ordered_dst_access_lengths[i] - 1 - + ordered_dst_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * + dst_scalar_per_access; + }(); + + vector_type_maker_t dst_tmp_vector; + + // copy data from buffer_ to dst_tmp_vector + static_for<0, DstScalarPerVector, 1>{}([&](auto i) { + constexpr index_t buffer_offset = + buffer_desc_.CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector); + + dst_tmp_vector.template AsType()(i) = buffer_[Number{}]; + }); + + using dst_vector_t = typename decltype(dst_tmp_vector)::type; + + // copy data from dst_tmp_vector to dst_buf + const bool is_dst_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); + + dst_buf.template Set( + dst_coord_.GetOffset(), + is_dst_valid, + dst_tmp_vector.template AsType()[Number<0>{}]); + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_dynamic_tensor_coordinate( + dst_desc, dst_coord_, dst_forward_iterators[dst_dim_access_order[i]]); + } + else + { + move_dynamic_tensor_coordinate( + dst_desc, dst_coord_, dst_backward_iterators[dst_dim_access_order[i]]); + } + } + }); + }); + + // move dst coordinate back to slice origin (or not) + if constexpr(DstResetCoordinateAfterRun) + { + const auto dst_reset_iterator = + make_dynamic_tensor_coordinate_iterator(dst_desc, GetDstCoordinateResetStep()); + + move_dynamic_tensor_coordinate(dst_desc, dst_coord_, dst_reset_iterator); + } + } + + template + __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf) + { + constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform(); + + constexpr auto zeros = typename uniform_sequence_gen::type{}; + + constexpr auto src_iterator_hacks = + 
make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), + generate_tuple([&](auto) { return zeros; }, Number{})); + + RunRead(src_desc, src_buf, src_iterator_hacks); + } + + template + __device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf) + { + constexpr index_t ntransform_dst = DstDesc::GetNumOfTransform(); + + constexpr auto zeros = typename uniform_sequence_gen::type{}; + + constexpr auto dst_iterator_hacks = + make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), + generate_tuple([&](auto) { return zeros; }, Number{})); + + RunWrite(dst_desc, dst_buf, dst_iterator_hacks); + } + + __device__ static constexpr auto GetSrcCoordinateResetStep() + { + constexpr auto I0 = Number<0>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; + + constexpr auto ordered_src_access_lengths = + container_reorder_given_new2old(src_access_lengths, src_dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_src_access_lengths[I0] - 1; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index after last iteration in RunRead(), if it has not being reset by + // RunRead() + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_src_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * + src_scalar_per_access; + }(); + + // + constexpr auto reset_src_data_step = [&]() { + Index reset_src_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; }); + + return reset_src_data_step_; + }(); + + return reset_src_data_step; + } + + __device__ static constexpr auto GetDstCoordinateResetStep() + { + constexpr auto I0 = Number<0>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; + + constexpr auto dst_dim_access_order = DstDimAccessOrder{}; + + constexpr auto ordered_dst_access_lengths = + container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_dst_access_lengths[I0] - 1; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index after last iteration in RunWrite(), if it has not being reset by + // RunWrite() + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * + dst_scalar_per_access; + }(); + + // + constexpr auto reset_dst_data_step = [&]() { + Index reset_dst_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; }); + + return reset_dst_data_step_; + }(); + + return reset_dst_data_step; + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = + SrcResetCoordinateAfterRun ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = + make_dynamic_tensor_coordinate_iterator(src_desc, adjusted_step_idx); + + move_dynamic_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + template + __device__ void + MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx, + const SrcMoveSliceWindowIteratorHack& src_move_slice_window_iterator_hack) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = + SrcResetCoordinateAfterRun ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); + + // is it OK to construct a new step every time? 
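+ // (illustrative, assuming an identity SrcDimAccessOrder and SrcScalarPerVector == 1)
+ // GetSrcCoordinateResetStep() is the negative of the data index reached by the last
+ // access of the zig-zag sweep; when SrcResetCoordinateAfterRun is false it is added to
+ // the window step above, fusing "move back to the slice origin" with "advance the slice
+ // window" into the single coordinate move below. E.g. for SliceLengths = [2, 4] the last
+ // access lands on data index [1, 0], so the reset step is [-1, 0].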
+ const auto adjusted_step = make_dynamic_tensor_coordinate_iterator( + src_desc, adjusted_step_idx, src_move_slice_window_iterator_hack); + + move_dynamic_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + // dst_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, + const Index& dst_slice_origin_step_idx) + { + // if dst coord was not reset by RunWrite(), then need to adjust the step here + const auto adjusted_step_idx = + DstResetCoordinateAfterRun ? dst_slice_origin_step_idx + : dst_slice_origin_step_idx + GetDstCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = + make_dynamic_tensor_coordinate_iterator(dst_desc, adjusted_step_idx); + + move_dynamic_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); + } + + private: + static constexpr auto buffer_desc_ = + make_dynamic_naive_tensor_descriptor_packed_v2(sequence_to_tuple_of_number(SliceLengths{})); + + static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize(); + + StaticBuffer buffer_; + + SrcCoord src_coord_; + DstCoord dst_coord_; +}; + +// Assume: +// 1. src: +// 1. SrcDesc is known at compile-time +// 2. SrcBuffer is DynamicBuffer +// 3. src_ref_idx is known at run-time +// 4. SrcRefToOriginDisplacement is known at compile-time +// 5. use #-iterator +// 2. dst: +// 1. DstDesc is known at compile-time +// 2. DstBuffer is StaticBuffer +// 3. DstOriginIdx is known at compile-time +// 4. use direct address calculation +// 3. vector access on src +template < + typename SrcData, + typename DstData, + typename SrcDesc, + typename DstDesc, + typename SliceLengths, + typename DimAccessOrder, + index_t SrcVectorDim, + index_t SrcScalarPerVector, + index_t SrcScalarStrideInVector, + typename std::enable_if::type = false> +struct ThreadwiseDynamicTensorSliceTransfer_v4 +{ + static constexpr index_t nDim = SliceLengths::Size(); + + using Index = MultiIndex; + + using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{})); + + using SrcCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(SrcDesc{}, Index{})); + + __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v4(const Index& src_ref_idx) + : src_ref_coord_(make_dynamic_tensor_coordinate(SrcDesc{}, src_ref_idx)) + { + static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), + "wrong! SrcDesc and DstDesc need to known at compile-time"); + + static_assert(SliceLengths::At(Number{}) % SrcScalarPerVector == 0, "wrong!"); + } + + template + __device__ void Run(const SrcDesc&, + const SrcRefToOriginDisplacement&, + const SrcBuffer& src_buf, + const DstDesc&, + const DstOriginIdx&, + DstBuffer& dst_buf) const + { + static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), + "wrong! SrcDesc and DstDesc need to known at compile-time"); + + static_assert(is_same>, + remove_cv_t>>::value && + is_same>, + remove_cv_t>>::value, + "wrong! SrcBuffer or DstBuffer data type is wrong"); + + static_assert(DstBuffer::IsStaticBuffer(), "wrong! DstBuffer need to be StaticBuffer"); + + static_assert( + is_known_at_compile_time< + remove_cv_t>>::value && + is_known_at_compile_time>>::value, + "wrong! 
SrcOriginToRefDistance and DstOriginToRefDistance need to be known " + "at compile-time"); + + // SrcDesc and DstDesc are known at compile-time + constexpr auto src_desc = remove_cv_t>{}; + constexpr auto dst_desc = remove_cv_t>{}; + + // SrcOriginToRefDisttance and DstOriginToRefDistance are known at compile-time + constexpr auto src_ref_to_origin_disp_idx = to_multi_index(SrcRefToOriginDisplacement{}); + constexpr auto dst_origin_idx = to_multi_index(DstOriginIdx{}); + + // scalar per access of each dim + constexpr auto src_scalar_per_access = generate_sequence_v2( + [&](auto i) constexpr { + if constexpr(i == SrcVectorDim) + { + return Number{}; + } + else + { + return Number<1>{}; + } + }, + Number{}); + + // scalar step (if steping on SrcVectorDim) of each dim + constexpr auto src_scalar_step_in_vector = generate_sequence_v2( + [&](auto i) constexpr { + if constexpr(i == SrcVectorDim) + { + return Number<1>{}; + } + else + { + return Number<0>{}; + } + }, + Number{}); + + constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto dim_access_order = DimAccessOrder{}; + + constexpr auto ordered_access_lengths = + container_reorder_given_new2old(access_lengths, dim_access_order); + + static_ford{}([&](auto ordered_access_idx) { +#if 0 + // TODO: unable to compile + // position in slice window + constexpr auto data_to_origin_disp_idx = + container_reorder_given_old2new(ordered_access_idx, dim_access_order) * + src_scalar_per_access; +#else + // position in slice window + constexpr auto data_to_origin_disp_idx = + ordered_access_idx.ReorderGivenOld2New(dim_access_order) * src_scalar_per_access; +#endif + // src coordinate + constexpr auto src_ref_to_data_disp_idx = + src_ref_to_origin_disp_idx + data_to_origin_disp_idx; + + constexpr auto src_ref_to_data_disp_coord_iterator = + make_dynamic_tensor_coordinate_iterator(src_desc, src_ref_to_data_disp_idx); + + auto src_data_coord = src_ref_coord_; + + move_dynamic_tensor_coordinate( + src_desc, src_data_coord, src_ref_to_data_disp_coord_iterator); + + vector_type_maker_t src_tmp_vector; + + using src_vector_t = typename decltype(src_tmp_vector)::type; + + const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid( + src_desc, src_data_coord); + + // copy data from src_buf into src_tmp_vector + src_tmp_vector.template AsType()(Number<0>{}) = + src_buf.template Get(src_data_coord.GetOffset(), is_src_valid); + + // copy data from src_tmp_vector to dst_tmp_vector (data cast data from SrcData to + // DstData) + vector_type_maker_t dst_tmp_vector; + + // TODO: if SrcData and DstData are vetor type, then static_cast may not compile + static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { + dst_tmp_vector.template AsType()(i) = + type_convert{}(src_tmp_vector.template AsType()[i]); + }); + + // copy data from dst_tmp_vector into dst_buf + static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { + constexpr index_t dst_offset = dst_desc.CalculateOffset( + dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector); + + dst_buf(Number{}) = dst_tmp_vector.template AsType()[i]; + }); + }); + } + + template + __device__ void MoveSrcSliceWindow(const SrcDesc&, + const SrcSliceMoveStepIdx& src_slice_move_step_idx) + { + constexpr auto src_desc = SrcDesc{}; + + const auto src_slice_move_step_iter = make_dynamic_tensor_coordinate_iterator( + src_desc, to_multi_index(src_slice_move_step_idx)); + + move_dynamic_tensor_coordinate(SrcDesc{}, src_ref_coord_, src_slice_move_step_iter); + } + + 
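+ // (illustrative usage sketch; a_block_desc, a_block_buf, a_thread_desc, a_thread_buf and
+ // the index values are hypothetical) the run-time reference coordinate is set once in the
+ // constructor, and each Run() addresses its slice by a compile-time displacement from it:
+ //
+ //   auto a_copy = ThreadwiseDynamicTensorSliceTransfer_v4<...>(src_ref_idx);
+ //   a_copy.Run(a_block_desc, make_tuple(I0, I0, I0),  // compile-time ref-to-origin step
+ //              a_block_buf,                           // dynamic (e.g. LDS) source buffer
+ //              a_thread_desc, make_tuple(I0, I0, I0), // compile-time dst origin
+ //              a_thread_buf);                         // static destination buffer
+ //   a_copy.MoveSrcSliceWindow(a_block_desc, make_tuple(I1, I0, I0));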
private: + SrcCoord src_ref_coord_; +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer_v2.hpp b/composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer_v2.hpp new file mode 100644 index 0000000000..ba60e26c38 --- /dev/null +++ b/composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer_v2.hpp @@ -0,0 +1,789 @@ +#ifndef CK_THREADWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_V2_HPP +#define CK_THREADWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_V2_HPP + +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" + +namespace ck { + +// Assume: +// 1. src_desc and dst_desc are not known at compile-time +// 2. SrcBuffer and DstBuffer are DynamicBuffer +// 3. src_slice_origin and dst_slice_origin are not known at compile-time, +// 4. Use thread buffer +template // control whether to move back dst coordinate after each + // RunWrite(), will be fused with MoveDstSliceWindow to + // save addr computation +struct ThreadwiseDynamicTensorSliceTransfer_v3r1 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + static constexpr index_t nDim = SliceLengths::Size(); + using Index = MultiIndex; + + using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{})); + using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{})); + + using SrcCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(SrcDesc{}, Index{})); + using DstCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(DstDesc{}, Index{})); + + __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v3r1(const SrcDesc& src_desc, + const Index& src_slice_origin, + const DstDesc& dst_desc, + const Index& dst_slice_origin) + : src_coord_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin)), + dst_coord_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin)) + { + // TODO: fix this + static_assert(is_same::value, + "wrong! current implementation assume SrcData and DstData are same type"); + + static_for<0, nDim, 1>{}([](auto i) { + static_assert(SliceLengths::At(i) % SrcVectorTensorLengths::At(i) == 0 && + SliceLengths::At(i) % DstVectorTensorLengths::At(i) == 0, + "wrong!"); + }); + } + + __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) + { + src_coord_ = make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx); + } + + __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) + { + dst_coord_ = make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx); + } + + template + __device__ void RunRead(const SrcDesc& src_desc, + const SrcBuffer& src_buf, + const SrcIteratorHacks& src_iterator_hacks) + { + static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or + SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, + "wrong!"); + + static_assert(is_same>, + remove_cv_t>>::value, + "wrong! 
SrcBuffer and SrcData data type are inconsistent"); + + // tensor descriptor for src_vector + constexpr auto src_vector_tensor_lengths = SrcVectorTensorLengths{}; + + constexpr auto src_vector_tensor_strides = container_reorder_given_old2new( + container_reverse_exclusive_scan( + container_reorder_given_new2old(src_vector_tensor_lengths, + SrcVectorTensorContiguousDimOrder{}), + math::multiplies_v2{}, + I1), + SrcVectorTensorContiguousDimOrder{}); + + constexpr auto src_vector_desc = make_dynamic_naive_tensor_descriptor_v2( + sequence_to_tuple_of_number(src_vector_tensor_lengths), + sequence_to_tuple_of_number(src_vector_tensor_strides)); + + // access order and lengths + constexpr auto src_access_lengths = SliceLengths{} / src_vector_tensor_lengths; + + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; + + constexpr auto ordered_src_access_lengths = + container_reorder_given_new2old(src_access_lengths, src_dim_access_order); + + // make forward iterators + const auto src_forward_iterators = generate_tuple( + [&](auto i) { + Index forward_step; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step(j) = (i.value == j.value) ? src_vector_tensor_lengths[i] : 0; + }); + + return make_dynamic_tensor_coordinate_iterator( + src_desc, forward_step, src_iterator_hacks[I0][i]); + }, + Number{}); + + // make backward iterators + const auto src_backward_iterators = generate_tuple( + [&](auto i) { + Index backward_step; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step(j) = (i.value == j.value) ? -src_vector_tensor_lengths[i] : 0; + }); + + return make_dynamic_tensor_coordinate_iterator( + src_desc, backward_step, src_iterator_hacks[I1][i]); + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_src_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_src_access_idx[I0]; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_src_access_idx[i] + : ordered_src_access_lengths[i] - 1 - + ordered_src_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * + src_vector_tensor_lengths; + }(); + + vector_type_maker_t src_vector; + + using src_vector_t = typename decltype(src_vector)::type; + + const bool is_src_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_); + + // copy data from src_buf to src_vector + src_vector.template AsType()(I0) = + src_buf.template Get(src_coord_.GetOffset(), is_src_valid); + + // copy data from src_vector to buffer_ + static_ford{}([&](auto src_vector_idx_) { + constexpr auto src_vector_idx = to_multi_index(src_vector_idx_); + + constexpr index_t src_vector_offset = + src_vector_desc.CalculateOffset(src_vector_idx); + + constexpr index_t buffer_offset = + buffer_desc_.CalculateOffset(src_data_idx + src_vector_idx); + + buffer_(Number{}) = + src_vector.template AsType()[Number{}]; + }); + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_dynamic_tensor_coordinate( + src_desc, src_coord_, src_forward_iterators[src_dim_access_order[i]]); + } + else + { + move_dynamic_tensor_coordinate( + src_desc, src_coord_, src_backward_iterators[src_dim_access_order[i]]); + } + } + }); + }); + + // move src coordinate back to slice origin (or not) + if constexpr(SrcResetCoordinateAfterRun) + { + const auto src_reset_iterator = + make_dynamic_tensor_coordinate_iterator(src_desc, GetSrcCoordinateResetStep()); + + move_dynamic_tensor_coordinate(src_desc, src_coord_, src_reset_iterator); + } + } + + template + __device__ void RunWrite(const DstDesc& dst_desc, + DstBuffer& dst_buf, + const DstIteratorHacks& dst_iterator_hacks) + { + static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or + DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, + "wrong!"); + + static_assert(is_same>, + remove_cv_t>>::value, + "wrong! 
SrcBuffer or DstBuffer data type is wrong"); + + // tensor descriptor for dst_vector + constexpr auto dst_vector_tensor_lengths = DstVectorTensorLengths{}; + + constexpr auto dst_vector_tensor_strides = container_reorder_given_old2new( + container_reverse_exclusive_scan( + container_reorder_given_new2old(dst_vector_tensor_lengths, + DstVectorTensorContiguousDimOrder{}), + math::multiplies_v2{}, + I1), + DstVectorTensorContiguousDimOrder{}); + + constexpr auto dst_vector_desc = make_dynamic_naive_tensor_descriptor_v2( + sequence_to_tuple_of_number(dst_vector_tensor_lengths), + sequence_to_tuple_of_number(dst_vector_tensor_strides)); + + // dst access order and lengths + constexpr auto dst_access_lengths = SliceLengths{} / dst_vector_tensor_lengths; + + constexpr auto dst_dim_access_order = DstDimAccessOrder{}; + + constexpr auto ordered_dst_access_lengths = + container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); + + // make forward iterators + const auto dst_forward_iterators = generate_tuple( + [&](auto i) { + Index forward_step; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step(j) = (i.value == j.value) ? dst_vector_tensor_lengths[i] : 0; + }); + + const auto forward_iterator = make_dynamic_tensor_coordinate_iterator( + dst_desc, forward_step, dst_iterator_hacks[I0][i]); + + return forward_iterator; + }, + Number{}); + + // make backward iterators + const auto dst_backward_iterators = generate_tuple( + [&](auto i) { + Index backward_step; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step(j) = (i.value == j.value) ? -dst_vector_tensor_lengths[i] : 0; + }); + + const auto backward_iterator = make_dynamic_tensor_coordinate_iterator( + dst_desc, backward_step, dst_iterator_hacks[I1][i]); + + return backward_iterator; + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_dst_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_dst_access_idx[I0]; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_dst_access_idx[i] + : ordered_dst_access_lengths[i] - 1 - + ordered_dst_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * + dst_vector_tensor_lengths; + }(); + + vector_type_maker_t dst_vector; + + // copy data from buffer_ to dst_vector (also cast from SrcData to DstData) + static_ford{}([&](auto dst_vector_idx_) { + constexpr auto dst_vector_idx = to_multi_index(dst_vector_idx_); + + constexpr index_t buffer_offset = + buffer_desc_.CalculateOffset(dst_data_idx + dst_vector_idx); + + constexpr index_t dst_vector_offset = + dst_vector_desc.CalculateOffset(dst_vector_idx); + + dst_vector.template AsType()(Number{}) = + type_convert{}(buffer_[Number{}]); + }); + + using dst_vector_t = typename decltype(dst_vector)::type; + + // copy data from dst_vector to dst_buf + const bool is_dst_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); + + dst_buf.template Set( + dst_coord_.GetOffset(), + is_dst_valid, + dst_vector.template AsType()[Number<0>{}]); + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_dynamic_tensor_coordinate( + dst_desc, dst_coord_, dst_forward_iterators[dst_dim_access_order[i]]); + } + else + { + move_dynamic_tensor_coordinate( + dst_desc, dst_coord_, dst_backward_iterators[dst_dim_access_order[i]]); + } + } + }); + }); + + // move dst coordinate back to slice origin (or not) + if constexpr(DstResetCoordinateAfterRun) + { + const auto dst_reset_iterator = + make_dynamic_tensor_coordinate_iterator(dst_desc, GetDstCoordinateResetStep()); + + move_dynamic_tensor_coordinate(dst_desc, dst_coord_, dst_reset_iterator); + } + } + + template + __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf) + { + constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform(); + + constexpr auto zeros = typename uniform_sequence_gen::type{}; + + constexpr auto src_iterator_hacks = + make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), + generate_tuple([&](auto) { return zeros; }, Number{})); + + RunRead(src_desc, src_buf, src_iterator_hacks); + } + + template + __device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf) + { + constexpr index_t ntransform_dst = DstDesc::GetNumOfTransform(); + + constexpr auto zeros = typename uniform_sequence_gen::type{}; + + constexpr auto dst_iterator_hacks = + make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), + generate_tuple([&](auto) { return zeros; }, Number{})); + + RunWrite(dst_desc, dst_buf, dst_iterator_hacks); + } + + __device__ static constexpr auto GetSrcCoordinateResetStep() + { + constexpr auto src_vector_tensor_lengths = SrcVectorTensorLengths{}; + + constexpr auto src_access_lengths = SliceLengths{} / src_vector_tensor_lengths; + + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; + + constexpr auto ordered_src_access_lengths = + container_reorder_given_new2old(src_access_lengths, src_dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + 
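// Sweep direction per dimension: the walk over the access grid is a snake /
+            // zig-zag order, so dimension i moves forward when the running linearization
+            // of the lower-order access indices (in access order) is even and backward
+            // when it is odd. For the reset step this parity is evaluated at the final
+            // access position, i.e. with every ordered index at its access length - 1.
+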
StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_src_access_lengths[I0] - 1; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index after last iteration in RunRead(), if it has not being reset by + // RunRead() + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * + src_vector_tensor_lengths; + }(); + + // + constexpr auto reset_src_data_step = [&]() { + Index reset_src_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; }); + + return reset_src_data_step_; + }(); + + return reset_src_data_step; + } + + __device__ static constexpr auto GetDstCoordinateResetStep() + { + constexpr auto dst_vector_tensor_lengths = DstVectorTensorLengths{}; + + constexpr auto dst_access_lengths = SliceLengths{} / dst_vector_tensor_lengths; + + constexpr auto dst_dim_access_order = DstDimAccessOrder{}; + + constexpr auto ordered_dst_access_lengths = + container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_dst_access_lengths[I0] - 1; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index after last iteration in RunWrite(), if it has not being reset by + // RunWrite() + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * + dst_vector_tensor_lengths; + }(); + + // + constexpr auto reset_dst_data_step = [&]() { + Index reset_dst_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; }); + + return reset_dst_data_step_; + }(); + + return reset_dst_data_step; + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = + SrcResetCoordinateAfterRun ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); + + // is it OK to construct a new step every time? 
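+        // Constructing the iterator on every call only derives the per-transform
+        // index deltas for this particular step; it should be folded away by the
+        // compiler as long as the step index is known at compile time, as the
+        // comment above this function requires.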
+ const auto adjusted_step = + make_dynamic_tensor_coordinate_iterator(src_desc, adjusted_step_idx); + + move_dynamic_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + template + __device__ void + MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx, + const SrcMoveSliceWindowIteratorHack& src_move_slice_window_iterator_hack) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = + SrcResetCoordinateAfterRun ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_dynamic_tensor_coordinate_iterator( + src_desc, adjusted_step_idx, src_move_slice_window_iterator_hack); + + move_dynamic_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + // dst_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, + const Index& dst_slice_origin_step_idx) + { + // if dst coord was not reset by RunWrite(), then need to adjust the step here + const auto adjusted_step_idx = + DstResetCoordinateAfterRun ? dst_slice_origin_step_idx + : dst_slice_origin_step_idx + GetDstCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = + make_dynamic_tensor_coordinate_iterator(dst_desc, adjusted_step_idx); + + move_dynamic_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); + } + + private: + static constexpr auto buffer_desc_ = + make_dynamic_naive_tensor_descriptor_packed_v2(sequence_to_tuple_of_number(SliceLengths{})); + + static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize(); + + StaticBuffer buffer_; + + SrcCoord src_coord_; + DstCoord dst_coord_; +}; + +// Assume: +// 1. src: +// 1. SrcDesc is known at compile-time +// 2. SrcBuffer is DynamicBuffer +// 3. src_ref_idx is known at run-time +// 4. SrcRefToOriginDisplacement is known at compile-time +// 5. use #-iterator +// 2. dst: +// 1. DstDesc is known at compile-time +// 2. DstBuffer is StaticBuffer +// 3. DstOriginIdx is known at compile-time +// 4. use direct address calculation +// 3. vector access on src +template < + typename SrcData, + typename DstData, + typename SrcDesc, + typename DstDesc, + typename SliceLengths, + typename DimAccessOrder, + typename SrcVectorTensorLengths, + typename SrcVectorTensorContiguousDimOrder, + typename std::enable_if::type = false> +struct ThreadwiseDynamicTensorSliceTransfer_v4r1 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + static constexpr index_t nDim = SliceLengths::Size(); + + using Index = MultiIndex; + + using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{})); + + using SrcCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(SrcDesc{}, Index{})); + + __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v4r1(const Index& src_ref_idx) + : src_ref_coord_(make_dynamic_tensor_coordinate(SrcDesc{}, src_ref_idx)) + { + static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), + "wrong! 
SrcDesc and DstDesc need to known at compile-time"); + + static_for<0, nDim, 1>{}([](auto i) { + static_assert(SliceLengths::At(i) % SrcVectorTensorLengths::At(i) == 0, "wrong!"); + }); + } + + template + __device__ void Run(const SrcDesc&, + const SrcRefToOriginDisplacement&, + const SrcBuffer& src_buf, + const DstDesc&, + const DstOriginIdx&, + DstBuffer& dst_buf) const + { + static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), + "wrong! SrcDesc and DstDesc need to known at compile-time"); + + static_assert(is_same>, + remove_cv_t>>::value && + is_same>, + remove_cv_t>>::value, + "wrong! SrcBuffer or DstBuffer data type is wrong"); + + static_assert(DstBuffer::IsStaticBuffer(), "wrong! DstBuffer need to be StaticBuffer"); + + static_assert( + is_known_at_compile_time< + remove_cv_t>>::value && + is_known_at_compile_time>>::value, + "wrong! SrcOriginToRefDistance and DstOriginToRefDistance need to be known " + "at compile-time"); + + // SrcDesc and DstDesc are known at compile-time + constexpr auto src_desc = remove_cv_t>{}; + constexpr auto dst_desc = remove_cv_t>{}; + + // SrcOriginToRefDisttance and DstOriginToRefDistance are known at compile-time + constexpr auto src_ref_to_origin_disp_idx = to_multi_index(SrcRefToOriginDisplacement{}); + constexpr auto dst_origin_idx = to_multi_index(DstOriginIdx{}); + + // tensor descriptor for src_vector + constexpr auto src_vector_tensor_lengths = SrcVectorTensorLengths{}; + + constexpr auto src_vector_tensor_strides = container_reorder_given_old2new( + container_reverse_exclusive_scan( + container_reorder_given_new2old(src_vector_tensor_lengths, + SrcVectorTensorContiguousDimOrder{}), + math::multiplies_v2{}, + I1), + SrcVectorTensorContiguousDimOrder{}); + + constexpr auto src_vector_desc = make_dynamic_naive_tensor_descriptor_v2( + sequence_to_tuple_of_number(src_vector_tensor_lengths), + sequence_to_tuple_of_number(src_vector_tensor_strides)); + + // access order and lengths + constexpr auto access_lengths = SliceLengths{} / src_vector_tensor_lengths; + + constexpr auto dim_access_order = DimAccessOrder{}; + + constexpr auto ordered_access_lengths = + container_reorder_given_new2old(access_lengths, dim_access_order); + + static_ford{}([&](auto ordered_access_idx) { + // position in slice window + constexpr auto data_to_origin_disp_idx = + ordered_access_idx.ReorderGivenOld2New(dim_access_order) * + src_vector_tensor_lengths; + + // src coordinate at starting point of src_vector + constexpr auto src_ref_to_data_disp_idx = + src_ref_to_origin_disp_idx + data_to_origin_disp_idx; + + constexpr auto src_ref_to_data_disp_coord_iterator = + make_dynamic_tensor_coordinate_iterator(src_desc, src_ref_to_data_disp_idx); + + auto src_data_coord = src_ref_coord_; + + move_dynamic_tensor_coordinate( + src_desc, src_data_coord, src_ref_to_data_disp_coord_iterator); + + vector_type_maker_t src_vector; + + using src_vector_t = typename decltype(src_vector)::type; + + const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid( + src_desc, src_data_coord); + + // copy data from src_buf into src_vector + src_vector.template AsType()(I0) = + src_buf.template Get(src_data_coord.GetOffset(), is_src_valid); + + // copy data from src_vector into dst_buf (also cast from SrcData to DstData) + static_ford{}([&](auto src_vector_idx_) { + constexpr auto src_vector_idx = to_multi_index(src_vector_idx_); + + constexpr index_t src_vector_offset = + src_vector_desc.CalculateOffset(src_vector_idx); + + constexpr index_t 
dst_offset = dst_desc.CalculateOffset( + dst_origin_idx + data_to_origin_disp_idx + src_vector_idx); + + dst_buf(Number{}) = type_convert{}( + src_vector.template AsType()[Number{}]); + }); + }); + } + + template + __device__ void MoveSrcSliceWindow(const SrcDesc&, + const SrcSliceMoveStepIdx& src_slice_move_step_idx) + { + constexpr auto src_desc = SrcDesc{}; + + const auto src_slice_move_step_iter = make_dynamic_tensor_coordinate_iterator( + src_desc, to_multi_index(src_slice_move_step_idx)); + + move_dynamic_tensor_coordinate(SrcDesc{}, src_ref_coord_, src_slice_move_step_iter); + } + + private: + SrcCoord src_ref_coord_; +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp b/composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp new file mode 100644 index 0000000000..153d512df7 --- /dev/null +++ b/composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp @@ -0,0 +1,162 @@ +#ifndef CK_THREADWISE_GEMM_DLOPS_V3_HPP +#define CK_THREADWISE_GEMM_DLOPS_V3_HPP + +#include "common_header.hpp" +#include "math.hpp" + +namespace ck { + +// C[M, N] += transpose(A[K, M]) * B[K, N] +// Element of matrix can be vectorized data +// Assume: +// 1. ADesc, BDesc, CDesc are known at compile-time +// 2. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time +template ::type = false> +struct ThreadwiseGemmDlops_km_kn_mn_v3 +{ + template + __device__ static void Run(const ABuffer& a_buf, + AOriginIdx, + const BBuffer& b_buf, + BOriginIdx, + CBuffer& c_buf, + COriginIdx) + { + static_assert(ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() && + CDesc::IsKnownAtCompileTime(), + "wrong! Desc should be known at compile-time"); + + static_assert( + is_known_at_compile_time>>::value && + is_known_at_compile_time>>::value && + is_known_at_compile_time>>::value, + "wrong! AOriginIdx, BOriginIdx, COringinIdx should be known at compile-time"); + + static_assert(is_same>, + remove_cv_t>>::value && + is_same>, + remove_cv_t>>::value && + is_same>, + remove_cv_t>>::value && + "wrong! 
inconsistent type"); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto E = ADesc{}.GetLength(I0); + constexpr auto K = ADesc{}.GetLength(I1); + + constexpr auto a_origin_idx = to_multi_index(AOriginIdx{}); + constexpr auto b_origin_idx = to_multi_index(BOriginIdx{}); + constexpr auto c_origin_idx = to_multi_index(COriginIdx{}); + + static_for<0, E, 1>{}([&](auto e) { + static_for<0, K, 1>{}([&](auto k) { + constexpr index_t a_offset = + ADesc{}.CalculateOffset(a_origin_idx + make_tuple(e, k)); + + if constexpr(H == 2 && W == 2) + { + constexpr index_t b_offset_0 = + BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 0)); + constexpr index_t b_offset_1 = + BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 1)); + constexpr index_t b_offset_2 = + BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 0)); + constexpr index_t b_offset_3 = + BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 1)); + + constexpr index_t c_offset_0 = + CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 0)); + constexpr index_t c_offset_1 = + CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 1)); + constexpr index_t c_offset_2 = + CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 0)); + constexpr index_t c_offset_3 = + CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 1)); + + amd_assembly_outer_product_1x4(a_buf[Number{}], + b_buf[Number{}], + b_buf[Number{}], + b_buf[Number{}], + b_buf[Number{}], + c_buf(Number{}), + c_buf(Number{}), + c_buf(Number{}), + c_buf(Number{})); + } + else if constexpr(H == 4 && W == 1) + { + constexpr index_t b_offset_0 = + BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 0)); + constexpr index_t b_offset_1 = + BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 0)); + constexpr index_t b_offset_2 = + BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 2, 0)); + constexpr index_t b_offset_3 = + BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 3, 0)); + + constexpr index_t c_offset_0 = + CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 0)); + constexpr index_t c_offset_1 = + CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 0)); + constexpr index_t c_offset_2 = + CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 2, 0)); + constexpr index_t c_offset_3 = + CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 3, 0)); + + amd_assembly_outer_product_1x4(a_buf[Number{}], + b_buf[Number{}], + b_buf[Number{}], + b_buf[Number{}], + b_buf[Number{}], + c_buf(Number{}), + c_buf(Number{}), + c_buf(Number{}), + c_buf(Number{})); + } + else + { + static_for<0, H, 1>{}([&](auto h) { + static_for<0, W, 1>{}([&](auto w) { + constexpr index_t b_offset = + BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, h, w)); + + constexpr index_t c_offset = + CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, h, w)); + +#if 0 + c_buf(Number{}) += inner_product_with_conversion{}( + a_buf[Number{}], b_buf[Number{}]); +#else + amd_assembly_inner_product(a_buf[Number{}], + b_buf[Number{}], + c_buf(Number{})); +#endif + }); + }); + } + }); + }); + } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/xdlops_gemm.hpp b/composable_kernel/include/tensor_operation/xdlops_gemm.hpp new file mode 100644 index 0000000000..876a1174e7 --- /dev/null +++ b/composable_kernel/include/tensor_operation/xdlops_gemm.hpp @@ -0,0 +1,801 @@ +#ifndef CK_XDLOPS_GEMM_HPP 
+#define CK_XDLOPS_GEMM_HPP + +#include "common_header.hpp" +#include "math.hpp" +#include "amd_xdlops.hpp" + +namespace ck { + +enum struct mfma_instr +{ + /// fp32 + mfma_f32_32x32x1xf32 = 0, + mfma_f32_16x16x1xf32, + mfma_f32_4x4x1xf32, + mfma_f32_32x32x2xf32, // k reduction + mfma_f32_16x16x4xf32, // k reduction + /// fp16 + mfma_f32_32x32x4f16, + mfma_f32_16x16x4f16, + mfma_f32_4x4x4f16, + mfma_f32_32x32x8f16, // k reduction + mfma_f32_16x16x16f16, // k reduction + /// bfp16 + mfma_f32_32x32x2bf16, + mfma_f32_16x16x2bf16, + mfma_f32_4x4x2bf16, + mfma_f32_32x32x4bf16, // k reduction + mfma_f32_16x16x8bf16, // k reduction +}; + +template +struct mfma_info; + +template <> +struct mfma_info +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_blk = 4; + static constexpr index_t num_regs_blk = group_size * num_groups_blk; + static constexpr index_t num_threads_blk = 32; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = wave_size / num_threads_blk; + static constexpr index_t num_output_blks = 2; + static constexpr index_t num_regs_xdlops = num_regs_blk * num_output_blks; + static constexpr index_t m = 32; + static constexpr index_t n = 32; + static constexpr index_t k = 1; + static constexpr index_t cycles = 64; + static constexpr index_t k_base = 1; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_32x32x1f32::Run(a, b, reg_c); + } +}; + +template <> +struct mfma_info +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_blk = 4; + static constexpr index_t num_regs_blk = group_size * num_groups_blk; + static constexpr index_t num_threads_blk = 32; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = wave_size / num_threads_blk; + static constexpr index_t num_output_blks = 1; + static constexpr index_t num_regs_xdlops = num_regs_blk * num_output_blks; + static constexpr index_t m = 32; + static constexpr index_t n = 32; + static constexpr index_t k = 2; + static constexpr index_t cycles = 64; + static constexpr index_t k_base = 1; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_32x32x2f32::Run(a, b, reg_c); + } +}; + +template <> +struct mfma_info +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_blk = 1; + static constexpr index_t num_regs_blk = group_size * num_groups_blk; + static constexpr index_t num_threads_blk = 16; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = wave_size / num_threads_blk; + static constexpr index_t num_output_blks = 1; + static constexpr index_t num_regs_xdlops = num_regs_blk * num_output_blks; + static constexpr index_t m = 16; + static constexpr index_t n = 16; + static constexpr index_t k = 4; + static constexpr index_t cycles = 32; + static constexpr index_t k_base = 1; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_16x16x4f32::Run(a, b, reg_c); + } +}; + +template <> +struct mfma_info +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_blk = 1; + static constexpr index_t num_regs_blk = group_size * num_groups_blk; + static constexpr index_t num_threads_blk = 16; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = wave_size / num_threads_blk; + static constexpr index_t num_output_blks = 4; + static 
constexpr index_t num_regs_xdlops = num_regs_blk * num_output_blks; + static constexpr index_t m = 16; + static constexpr index_t n = 16; + static constexpr index_t k = 1; + static constexpr index_t cycles = 32; + static constexpr index_t k_base = 1; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_16x16x1f32::Run(a, b, reg_c); + } +}; + +// treat 4x4x1 as a single-blk 4x64 mfma +template <> +struct mfma_info +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_blk = 1; + static constexpr index_t num_regs_blk = group_size * num_groups_blk; + static constexpr index_t num_threads_blk = 64; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = 1; + static constexpr index_t num_output_blks = 1; + static constexpr index_t num_regs_xdlops = 4; + static constexpr index_t m = 4; + static constexpr index_t n = 64; + static constexpr index_t k = 1; + static constexpr index_t cycles = 8; + static constexpr index_t k_base = 1; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_4x4x1f32::Run(a, b, reg_c); + } +}; + +template <> +struct mfma_info +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_blk = 4; + static constexpr index_t num_regs_blk = group_size * num_groups_blk; + static constexpr index_t num_threads_blk = 32; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = wave_size / num_threads_blk; + static constexpr index_t num_output_blks = 2; + static constexpr index_t num_regs_xdlops = num_regs_blk * num_output_blks; + static constexpr index_t m = 32; + static constexpr index_t n = 32; + static constexpr index_t k = 4; + static constexpr index_t cycles = 64; + static constexpr index_t k_base = 4; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_32x32x4f16::Run(a, b, reg_c); + } +}; + +template <> +struct mfma_info +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_blk = 4; + static constexpr index_t num_regs_blk = group_size * num_groups_blk; + static constexpr index_t num_threads_blk = 32; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = wave_size / num_threads_blk; + static constexpr index_t num_output_blks = 1; + static constexpr index_t num_regs_xdlops = num_regs_blk * num_output_blks; + static constexpr index_t m = 32; + static constexpr index_t n = 32; + static constexpr index_t k = 8; + static constexpr index_t cycles = 64; + static constexpr index_t k_base = 4; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_32x32x8f16::Run(a, b, reg_c); + } +}; + +template <> +struct mfma_info +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_blk = 1; + static constexpr index_t num_regs_blk = group_size * num_groups_blk; + static constexpr index_t num_threads_blk = 16; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = wave_size / num_threads_blk; + static constexpr index_t num_output_blks = 1; + static constexpr index_t num_regs_xdlops = num_regs_blk * num_output_blks; + static constexpr index_t m = 16; + static constexpr index_t n = 16; + static constexpr index_t k = 16; + static constexpr index_t cycles = 32; + static constexpr index_t k_base = 4; + + template + __device__ void 
run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_16x16x16f16::Run(a, b, reg_c); + } +}; + +template <> +struct mfma_info +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_blk = 1; + static constexpr index_t num_regs_blk = group_size * num_groups_blk; + static constexpr index_t num_threads_blk = 16; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = wave_size / num_threads_blk; + static constexpr index_t num_output_blks = 4; + static constexpr index_t num_regs_xdlops = num_regs_blk * num_output_blks; + static constexpr index_t m = 16; + static constexpr index_t n = 16; + static constexpr index_t k = 4; + static constexpr index_t cycles = 32; + static constexpr index_t k_base = 4; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_16x16x4f16::Run(a, b, reg_c); + } +}; + +template <> +struct mfma_info +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_blk = 1; + static constexpr index_t num_regs_blk = group_size * num_groups_blk; + static constexpr index_t num_threads_blk = 64; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = 1; + static constexpr index_t num_output_blks = 1; + static constexpr index_t num_regs_xdlops = 4; + static constexpr index_t m = 4; + static constexpr index_t n = 64; + static constexpr index_t k = 4; + static constexpr index_t cycles = 8; + static constexpr index_t k_base = 4; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_4x4x4f16::Run(a, b, reg_c); + } +}; + +#if 0 +template <> +struct mfma_info +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_blk = 4; + static constexpr index_t num_regs_blk = group_size * num_groups_blk; + static constexpr index_t num_threads_blk = 32; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = wave_size / num_threads_blk; + static constexpr index_t num_output_blks = 2; + static constexpr index_t num_regs_xdlops = num_regs_blk * num_output_blks; + static constexpr index_t m = 32; + static constexpr index_t n = 32; + static constexpr index_t k = 2; + static constexpr index_t cycles = 64; + static constexpr index_t k_base = 2; + + template + __device__ FloatC run(const FloatA* a, const FloatB* b, FloatC reg_c) const + { + const auto p_a = reinterpret_cast(a); + const auto p_b = reinterpret_cast(b); + + return intrin_mfma_f32_32x32x2bf16::run( + p_a, p_b, reg_c); + } +}; + +template <> +struct mfma_info +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_blk = 4; + static constexpr index_t num_regs_blk = group_size * num_groups_blk; + static constexpr index_t num_threads_blk = 32; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = wave_size / num_threads_blk; + static constexpr index_t num_output_blks = 1; + static constexpr index_t num_regs_xdlops = num_regs_blk * num_output_blks; + static constexpr index_t m = 32; + static constexpr index_t n = 32; + static constexpr index_t k = 4; + static constexpr index_t cycles = 64; + static constexpr index_t k_base = 2; + + template + __device__ FloatC run(const FloatA* a, const FloatB* b, FloatC reg_c) const + { + const auto p_a = reinterpret_cast(a); + const auto p_b = reinterpret_cast(b); + + return intrin_mfma_f32_32x32x4bf16(p_a, p_b, reg_c); + } +}; + +template <> 
+struct mfma_info +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_blk = 1; + static constexpr index_t num_regs_blk = group_size * num_groups_blk; + static constexpr index_t num_threads_blk = 16; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = wave_size / num_threads_blk; + static constexpr index_t num_output_blks = 1; + static constexpr index_t num_regs_xdlops = num_regs_blk * num_output_blks; + static constexpr index_t m = 16; + static constexpr index_t n = 16; + static constexpr index_t k = 8; + static constexpr index_t cycles = 32; + static constexpr index_t k_base = 2; + + template + __device__ FloatC run(const FloatA* a, const FloatB* b, FloatC reg_c) const + { + const auto p_a = reinterpret_cast(a); + const auto p_b = reinterpret_cast(b); + + return intrin_mfma_f32_16x16x8bf16(p_a, p_b, reg_c); + } +}; + +template <> +struct mfma_info +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_blk = 1; + static constexpr index_t num_regs_blk = group_size * num_groups_blk; + static constexpr index_t num_threads_blk = 16; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = wave_size / num_threads_blk; + static constexpr index_t num_output_blks = 4; + static constexpr index_t num_regs_xdlops = num_regs_blk * num_output_blks; + static constexpr index_t m = 16; + static constexpr index_t n = 16; + static constexpr index_t k = 2; + static constexpr index_t cycles = 32; + static constexpr index_t k_base = 2; + + template + __device__ FloatC run(const FloatA* a, const FloatB* b, FloatC reg_c) const + { + const auto p_a = reinterpret_cast(a); + const auto p_b = reinterpret_cast(b); + + return intrin_mfma_f32_16x16x2bf16(p_a, p_b, reg_c); + } +}; + +template <> +struct mfma_info +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_blk = 1; + static constexpr index_t num_regs_blk = group_size * num_groups_blk; + static constexpr index_t num_threads_blk = 64; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = 1; + static constexpr index_t num_output_blks = 1; + static constexpr index_t num_regs_xdlops = 4; + static constexpr index_t m = 4; + static constexpr index_t n = 64; + static constexpr index_t k = 2; + static constexpr index_t cycles = 8; + static constexpr index_t k_base = 2; + + template + __device__ FloatC run(const FloatA* a, const FloatB* b, FloatC reg_c) const + { + const auto p_a = reinterpret_cast(a); + const auto p_b = reinterpret_cast(b); + + return intrin_mfma_f32_4x4x2bf16::run(p_a, p_b, reg_c); + } +}; +#endif + +template +struct xdlops_info +{ + static constexpr auto mfma_type = mfma_info{}; + + static constexpr index_t MPerXdlops = MPerXdlops_; + static constexpr index_t NPerXdlops = NPerXdlops_; + + static constexpr bool IsABroadcast() + { + static_assert(NPerXdlops >= MPerXdlops, "only support ABroadcast"); + return true; + } + + static constexpr bool IsKReduction() + { + return (mfma_type.num_output_blks == 1) && (mfma_type.num_input_blks > 1); + } + + static constexpr index_t GetKPerXdlops() + { + return IsKReduction() ? 
mfma_type.num_input_blks : 1; + } + + static constexpr index_t GetNumCRegs() { return MPerXdlops * NPerXdlops / mfma_type.wave_size; } +}; + +template +struct XdlopsGemm +{ + template + static constexpr auto GetXdlopsInfo(); + + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } + + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } + + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } + + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } + + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } + + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } + + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } + + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } + + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } + + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } + + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } + + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } + + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } + + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } + +#if 0 + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } + + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } + + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } + + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } + + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } + + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } + + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } + + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } + + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } + + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } + + template <> + static constexpr auto GetXdlopsInfo() + { + return xdlops_info{}; + } +#endif + + using CIndex = MultiIndex<2>; + + __device__ static constexpr index_t GetNumBlks() { return mfma_type.num_output_blks; } + + __device__ static constexpr index_t GetNumXdlops() + { + return MPerXdlops * NPerXdlops / (mfma_type.m * mfma_type.n * mfma_type.num_output_blks); + } + + __host__ __device__ constexpr XdlopsGemm() + { + static_assert(NPerXdlops == 4 || NPerXdlops == 8 || NPerXdlops == 16 || NPerXdlops == 32 || + NPerXdlops == 64, + "Only support GemmNPerXdlops == 4, 8, 16, 32 or 64 for xdlops"); + + static_assert(MPerXdlops == 4 || MPerXdlops == 8 || MPerXdlops == 16 || MPerXdlops == 32 || + MPerXdlops == 64, + "Only support GemmMPerXdlops == 4, 8, 16, 32 or 64 for xdlops"); + + static_assert(mfma_type.num_threads_blk == mfma_type.n, "n != num_threads_blk"); + static_assert(mfma_type.num_regs_blk * mfma_type.num_input_blks == mfma_type.m, + "m != num_input_blks * num_regs_blk"); + static_assert(mfma_type.num_output_blks == mfma_type.num_input_blks || + mfma_type.num_output_blks == 1, + "incorrect num_output_blks"); + static_assert(mfma_type.num_regs_blk * mfma_type.wave_size == 
mfma_type.m * mfma_type.n, + "num_regs_blk incorrect"); + + static_assert(mfma_type.k % mfma_type.k_base == 0, "k % kbase != 0!"); + } + + __device__ static constexpr index_t GetRegSizePerXdlops() + { + return MPerXdlops * NPerXdlops / mfma_type.wave_size; + } + + template + __device__ void Run(const FloatA& p_a_wave, const FloatB& p_b_wave, FloatC& p_c_thread) const + { + static_assert(is_same::value || is_same::value || + is_same::value, + "base base_type must be float, half, ushort!"); + + static_assert(KPack % mfma_type.k_base == 0, "KPack cannot be divided by k_base"); + + constexpr index_t c_offset = CDesc{}.CalculateOffset(make_tuple(m0, n0)) * GetNumXdlops(); + + static_for<0, KPack, mfma_type.k_base>{}([&](auto k) { + constexpr index_t a_offset = ADesc{}.CalculateOffset(make_tuple(0, m0, 0, k)); + constexpr index_t b_offset = BDesc{}.CalculateOffset(make_tuple(0, n0, 0, k)); + + mfma_type.template run( + p_a_wave[Number{}], + p_b_wave[Number{}], + p_c_thread); + }); + } + + __device__ static CIndex GetBeginOfThreadBlk(index_t xdlops_i, index_t blk_i) + { + const index_t laneId = get_thread_local_1d_id() % mfma_type.wave_size; + const index_t blk_id = laneId / mfma_type.num_threads_blk; + const index_t blk_td = laneId % mfma_type.num_threads_blk; + + index_t n_offset = blk_i * mfma_type.n + blk_td; + index_t m_offset = xdlops_i * mfma_type.m + blk_id * mfma_type.group_size; + + return CIndex{m_offset, n_offset}; + } + + static constexpr index_t MRepeats = GetXdlopsInfo().MRepeats; + static constexpr index_t NRepeats = GetXdlopsInfo().NRepeats; + static constexpr index_t MPerXdlops = GetXdlopsInfo().MPerXdlops; + static constexpr index_t NPerXdlops = GetXdlopsInfo().NPerXdlops; + + static constexpr bool IsKReduction = GetXdlopsInfo().IsKReduction(); + static constexpr bool IsABroadcast = GetXdlopsInfo().IsABroadcast(); + static constexpr index_t KPerXdlops = GetXdlopsInfo().GetKPerXdlops(); + + static constexpr auto GetBlkId(const index_t lane_id) + { + return lane_id / mfma_type.num_threads_blk; + } + + static constexpr auto GetBlkTd(const index_t lane_id) + { + return lane_id % mfma_type.num_threads_blk; + } + + static constexpr auto mfma_type = GetXdlopsInfo().mfma_type; + + struct CLayout + { + __host__ __device__ static constexpr index_t M1() { return mfma_type.num_groups_blk; } + __host__ __device__ static constexpr index_t M0() { return mfma_type.group_size; } + __host__ __device__ static constexpr index_t N1() { return mfma_type.num_input_blks; } + __host__ __device__ static constexpr index_t N0() { return mfma_type.num_threads_blk; } + + __device__ static constexpr index_t GetBlkSize() { return mfma_type.num_regs_blk; } + + __device__ static constexpr index_t GetNumBlks() { return mfma_type.num_output_blks; } + + __device__ static constexpr index_t GetNumXdlops() + { + return MPerXdlops * NPerXdlops / + (mfma_type.m * mfma_type.n * mfma_type.num_output_blks); + } + }; + + __host__ __device__ static constexpr auto GetCLayout() { return CLayout{}; } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/amd_buffer_addressing_v2.hpp b/composable_kernel/include/utility/amd_buffer_addressing_v2.hpp new file mode 100644 index 0000000000..0139bceb61 --- /dev/null +++ b/composable_kernel/include/utility/amd_buffer_addressing_v2.hpp @@ -0,0 +1,654 @@ +#ifndef CK_AMD_BUFFER_ADDRESSING_V2_HPP +#define CK_AMD_BUFFER_ADDRESSING_V2_HPP + +#include "data_type.hpp" + +namespace ck { + +template +union BufferResource_v2 +{ + // 128 bit SGPRs to supply buffer 
resource in buffer instructions + // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions + int32x4_t data; + StaticallyIndexedArray address; + StaticallyIndexedArray range; + StaticallyIndexedArray config; +}; + +template +__device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t data_space_size) +{ + BufferResource_v2 wave_buffer_resource; + + // wavewise base address (64 bit) + wave_buffer_resource.address(Number<0>{}) = const_cast*>(p_wave); + // wavewise range (32 bit) + wave_buffer_resource.range(Number<2>{}) = data_space_size * sizeof(T); + // wavewise setting (32 bit) + wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD; + + return wave_buffer_resource.data; +} + +// load +__device__ int8_t +llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8"); + +__device__ int8x2_t +llvm_amdgcn_raw_buffer_load_i8x2(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8"); + +__device__ int8x4_t +llvm_amdgcn_raw_buffer_load_i8x4(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8"); + +__device__ int16_t +llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32"); +__device__ int32_t +llvm_amdgcn_raw_buffer_load_i32(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32"); + +__device__ int32x2_t +llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32"); + +__device__ int32x4_t +llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32"); +// half +__device__ half_t +llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16"); + +__device__ half2_t +llvm_amdgcn_raw_buffer_load_fp16x2(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16"); + +__device__ half4_t +llvm_amdgcn_raw_buffer_load_fp16x4(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16"); + +// float +__device__ float +llvm_amdgcn_raw_buffer_load_fp32(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32"); + +__device__ float2_t +llvm_amdgcn_raw_buffer_load_fp32x2(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32"); + +__device__ float4_t +llvm_amdgcn_raw_buffer_load_fp32x4(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32"); + +// store +__device__ void +llvm_amdgcn_raw_buffer_store_i8(int8_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8"); + +__device__ void +llvm_amdgcn_raw_buffer_store_i8x2(int8x2_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8"); + +__device__ void +llvm_amdgcn_raw_buffer_store_i8x4(int8x4_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t 
soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8"); + +__device__ void +llvm_amdgcn_raw_buffer_store_i16(int16_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16"); + +__device__ void +llvm_amdgcn_raw_buffer_store_i32(int32_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32"); + +__device__ void +llvm_amdgcn_raw_buffer_store_i32x2(int32x2_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32"); + +__device__ void +llvm_amdgcn_raw_buffer_store_i32x4(int32x4_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32"); + +// half +__device__ void +llvm_amdgcn_raw_buffer_store_fp16(half_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16"); + +__device__ void +llvm_amdgcn_raw_buffer_store_fp16x2(half2_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16"); + +__device__ void +llvm_amdgcn_raw_buffer_store_fp16x4(half4_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16"); +// float +__device__ void +llvm_amdgcn_raw_buffer_store_fp32(float vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32"); + +__device__ void +llvm_amdgcn_raw_buffer_store_fp32x2(float2_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32"); + +__device__ void +llvm_amdgcn_raw_buffer_store_fp32x4(float4_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32"); + +template +__device__ typename vector_type::type +amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource, + index_t src_thread_addr_offset, + index_t src_wave_addr_offset) +{ + static_assert( + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)), + "wrong! 
not implemented"); + + if constexpr(is_same::value) + { + if constexpr(N == 1) + { + return llvm_amdgcn_raw_buffer_load_fp32( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + } + else if constexpr(N == 2) + { + return llvm_amdgcn_raw_buffer_load_fp32x2( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + } + else if constexpr(N == 4) + { + return llvm_amdgcn_raw_buffer_load_fp32x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + } + else if constexpr(N == 8) + { + vector_type tmp; + + tmp.AsType()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_fp32x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + tmp.AsType()(Number<1>{}) = + llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset + 4 * sizeof(float), + 0); + + return tmp.AsType()(Number<0>{}); + } + } + else if constexpr(is_same::value) + { + if constexpr(N == 1) + { + return llvm_amdgcn_raw_buffer_load_fp16( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + } + else if constexpr(N == 2) + { + return llvm_amdgcn_raw_buffer_load_fp16x2( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + } + else if constexpr(N == 4) + { + return llvm_amdgcn_raw_buffer_load_fp16x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + } + else if constexpr(N == 8) + { +#if 0 + vector_type tmp; + + tmp.AsType()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_fp16x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + tmp.AsType()(Number<1>{}) = + llvm_amdgcn_raw_buffer_load_fp16x4(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset + 4 * sizeof(half_t), + 0); + + return tmp.AsType()(Number<0>{}); +#else + float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + return as_type(tmp); +#endif + } + } + else if constexpr(is_same::value) + { + if constexpr(N == 1) + { + return llvm_amdgcn_raw_buffer_load_i32( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + } + else if constexpr(N == 2) + { + return llvm_amdgcn_raw_buffer_load_i32x2( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + } + else if constexpr(N == 4) + { + return llvm_amdgcn_raw_buffer_load_i32x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + } + else if constexpr(N == 8) + { + vector_type tmp; + + tmp.AsType()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i32x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + tmp.AsType()(Number<1>{}) = + llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset + 4 * sizeof(int32_t), + 0); + return tmp.AsType()(Number<0>{}); + } + } + else if constexpr(is_same::value) + { + if constexpr(N == 1) + { + return llvm_amdgcn_raw_buffer_load_i8( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + } + else if constexpr(N == 2) + { +#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE + return llvm_amdgcn_raw_buffer_load_i8x2( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); +#else + int16_t tmp = llvm_amdgcn_raw_buffer_load_i16( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + return as_type(tmp); +#endif + } + else if 
constexpr(N == 4) + { +#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE + return llvm_amdgcn_raw_buffer_load_i8x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); +#else + int32_t tmp = llvm_amdgcn_raw_buffer_load_i32( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + return as_type(tmp); +#endif + } + else if constexpr(N == 8) + { +#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE + vector_type tmp; + + tmp.AsType()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i8x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + tmp.AsType()(Number<1>{}) = + llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset + 4 * sizeof(int8_t), + 0); + + return tmp.AsType()(Number<0>{}); +#else + int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + return as_type(tmp); +#endif + } + else if constexpr(N == 16) + { +#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE + vector_type tmp; + + tmp.AsType()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i8x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + tmp.AsType()(Number<1>{}) = + llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset + 4 * sizeof(int8_t), + 0); + + tmp.AsType()(Number<2>{}) = + llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset + 8 * sizeof(int8_t), + 0); + + tmp.AsType()(Number<3>{}) = + llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset + 12 * sizeof(int8_t), + 0); + + return tmp.AsType()(Number<0>{}); +#else + int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + return as_type(tmp); +#endif + } + } +} + +template +__device__ void amd_buffer_store_impl_v2(const typename vector_type::type src_thread_data, + int32x4_t dst_wave_buffer_resource, + index_t dst_thread_addr_offset, + index_t dst_wave_addr_offset) +{ + static_assert( + (is_same::value && (N == 1 || N == 2 || N == 4)) || + (is_same::value && (N == 1 || N == 2 || N == 4)) || + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)), + "wrong! 
not implemented"); + + if constexpr(is_same::value) + { + if constexpr(N == 1) + { + llvm_amdgcn_raw_buffer_store_fp32(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 2) + { + llvm_amdgcn_raw_buffer_store_fp32x2(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 4) + { + llvm_amdgcn_raw_buffer_store_fp32x4(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + } + else if constexpr(is_same::value) + { + if constexpr(N == 1) + { + llvm_amdgcn_raw_buffer_store_i32(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 2) + { + llvm_amdgcn_raw_buffer_store_i32x2(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 4) + { + llvm_amdgcn_raw_buffer_store_i32x4(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + } + else if constexpr(is_same::value) + { + if constexpr(N == 1) + { + llvm_amdgcn_raw_buffer_store_i8(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 2) + { +#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE + llvm_amdgcn_raw_buffer_store_i8x2(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); +#else + llvm_amdgcn_raw_buffer_store_i16(as_type(src_thread_data), + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); +#endif + } + else if constexpr(N == 4) + { +#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE + llvm_amdgcn_raw_buffer_store_i8x4(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); +#else + llvm_amdgcn_raw_buffer_store_i32(as_type(src_thread_data), + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); +#endif + } + else if constexpr(N == 8) + { + llvm_amdgcn_raw_buffer_store_i32x2(as_type(src_thread_data), + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 16) + { + llvm_amdgcn_raw_buffer_store_i32x4(as_type(src_thread_data), + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + } + else if constexpr(is_same::value) + { + if constexpr(N == 1) + { + llvm_amdgcn_raw_buffer_store_fp16(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 2) + { + llvm_amdgcn_raw_buffer_store_fp16x2(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 4) + { + llvm_amdgcn_raw_buffer_store_fp16x4(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 8) + { + vector_type tmp{src_thread_data}; + + llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType()[Number<0>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + + llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType()[Number<1>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset + 4 * sizeof(half_t), + 0); + } + } +} + +// buffer_load requires: +// 1) p_src_wave must be in global memory space +// 2) 
p_src_wave to be a wavewise pointer. +// It is user's responsibility to make sure that is true. +template +__device__ typename vector_type_maker::type::type +amd_buffer_load_v2(const T* p_src_wave, + index_t src_thread_data_offset, + bool src_thread_data_valid, + index_t src_element_space) +{ + const int32x4_t src_wave_buffer_resource = + make_wave_buffer_resource(p_src_wave, src_element_space); + + index_t src_thread_addr_offset = src_thread_data_offset * sizeof(T); + + using vector_t = typename vector_type_maker::type::type; + using scalar_t = typename scalar_type::type; + constexpr index_t vector_size = scalar_type::vector_size; + +#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK + uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff; + + return amd_buffer_load_impl_v2( + src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0); +#else + vector_t tmp = amd_buffer_load_impl_v2( + src_wave_buffer_resource, src_thread_addr_offset, 0); + + return src_thread_data_valid ? tmp : vector_t(0); +#endif +} + +// buffer_store requires: +// 1) p_dst_wave must be global memory +// 2) p_dst_wave to be a wavewise pointer. +// It is user's responsibility to make sure that is true. +template +__device__ void +amd_buffer_store_v2(const typename vector_type_maker::type::type src_thread_data, + T* p_dst_wave, + const index_t dst_thread_data_offset, + const bool dst_thread_data_valid, + const index_t dst_element_space) +{ + const int32x4_t dst_wave_buffer_resource = + make_wave_buffer_resource(p_dst_wave, dst_element_space); + + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(T); + + using vector_t = typename vector_type_maker::type::type; + using scalar_t = typename scalar_type::type; + constexpr index_t vector_size = scalar_type::vector_size; + +#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK + uint32_t dst_addr_shift = dst_thread_data_valid ? 
0 : 0x7fffffff; + + amd_buffer_store_impl_v2( + src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0); +#else + if(dst_thread_data_valid) + { + amd_buffer_store_impl_v2( + src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0); + } +#endif +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/amd_dlop.hpp b/composable_kernel/include/utility/amd_dlop.hpp new file mode 100644 index 0000000000..8ce19012e9 --- /dev/null +++ b/composable_kernel/include/utility/amd_dlop.hpp @@ -0,0 +1,188 @@ +#ifndef CK_AMD_DLOP_HPP +#define CK_AMD_DLOP_HPP + +#include "data_type.hpp" + +namespace ck { + +template +__device__ void amd_inner_product_dlop(const TA& a, const TB& b, TC& c); + +template <> +__device__ void +amd_inner_product_dlop(const float& a, const float& b, float& c) +{ +#if CK_USE_AMD_DLOP_INLINE_ASM + asm volatile("\n \ + v_fmac_f32 %0, %1, %2 \n \ + " + : "=v"(c) + : "v"(a), "v"(b), "0"(c)); +#else + c += a * b; +#endif +} + +template <> +__device__ void +amd_inner_product_dlop(const float2_t& a, const float2_t& b, float& c) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + amd_inner_product_dlop(vector_type{a}.AsType()[I0], + vector_type{b}.AsType()[I0], + c); + + amd_inner_product_dlop(vector_type{a}.AsType()[I1], + vector_type{b}.AsType()[I1], + c); +} + +template <> +__device__ void +amd_inner_product_dlop(const float4_t& a, const float4_t& b, float& c) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + amd_inner_product_dlop(vector_type{a}.AsType()[I0], + vector_type{b}.AsType()[I0], + c); + + amd_inner_product_dlop(vector_type{a}.AsType()[I1], + vector_type{b}.AsType()[I1], + c); + + amd_inner_product_dlop(vector_type{a}.AsType()[I2], + vector_type{b}.AsType()[I2], + c); + + amd_inner_product_dlop(vector_type{a}.AsType()[I3], + vector_type{b}.AsType()[I3], + c); +} + +#if CK_USE_AMD_DLOP +template <> +__device__ void +amd_inner_product_dlop(const half2_t& a, const half2_t& b, float& c) +{ +#if CK_USE_AMD_DLOP_INLINE_ASM + asm volatile("\n \ + v_dot2_f32_f16 %0, %1, %2, %0\n \ + " + : "=v"(c) + : "v"(a), "v"(b), "0"(c)); +#else + c = __builtin_amdgcn_sdot2(a, b, c, false); +#endif +} + +template <> +__device__ void +amd_inner_product_dlop(const half4_t& a, const half4_t& b, float& c) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + amd_inner_product_dlop(vector_type{a}.AsType()[I0], + vector_type{b}.AsType()[I0], + c); + + amd_inner_product_dlop(vector_type{a}.AsType()[I1], + vector_type{b}.AsType()[I1], + c); +} + +template <> +__device__ void +amd_inner_product_dlop(const half8_t& a, const half8_t& b, float& c) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + amd_inner_product_dlop(vector_type{a}.AsType()[I0], + vector_type{b}.AsType()[I0], + c); + + amd_inner_product_dlop(vector_type{a}.AsType()[I1], + vector_type{b}.AsType()[I1], + c); + + amd_inner_product_dlop(vector_type{a}.AsType()[I2], + vector_type{b}.AsType()[I2], + c); + + amd_inner_product_dlop(vector_type{a}.AsType()[I3], + vector_type{b}.AsType()[I3], + c); +} + +template <> +__device__ void amd_inner_product_dlop(const int8x4_t& a, + const int8x4_t& b, + int32_t& c) +{ +#if CK_USE_AMD_DLOP_INLINE_ASM + asm volatile("\n \ + v_dot4_i32_i8 %0, %1, %2, %0\n \ + " + : "=v"(c) + : "v"(as_type(a)), 
"v"(as_type(b)), "0"(c)); +#else + c = __builtin_amdgcn_sdot4(as_type(a), as_type(b), c, false); +#endif +} + +template <> +__device__ void amd_inner_product_dlop(const int8x8_t& a, + const int8x8_t& b, + int32_t& c) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + amd_inner_product_dlop(vector_type{a}.AsType()[I0], + vector_type{b}.AsType()[I0], + c); + + amd_inner_product_dlop(vector_type{a}.AsType()[I1], + vector_type{b}.AsType()[I1], + c); +} + +template <> +__device__ void amd_inner_product_dlop(const int8x16_t& a, + const int8x16_t& b, + int32_t& c) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + amd_inner_product_dlop(vector_type{a}.AsType()[I0], + vector_type{b}.AsType()[I0], + c); + + amd_inner_product_dlop(vector_type{a}.AsType()[I1], + vector_type{b}.AsType()[I1], + c); + + amd_inner_product_dlop(vector_type{a}.AsType()[I2], + vector_type{b}.AsType()[I2], + c); + + amd_inner_product_dlop(vector_type{a}.AsType()[I3], + vector_type{b}.AsType()[I3], + c); +} +#endif // CK_USE_AMD_DLOP + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp new file mode 100644 index 0000000000..ce80fc0549 --- /dev/null +++ b/composable_kernel/include/utility/amd_inline_asm.hpp @@ -0,0 +1,353 @@ +#ifndef CK_AMD_INLINE_ASM_HPP +#define CK_AMD_INLINE_ASM_HPP + +#include "data_type.hpp" + +namespace ck { + +// c0 += inner_product(a, b0) +// c1 += inner_product(a, b1) +__device__ void amd_assembly_outer_product_1x2(float a, float b0, float b1, float& c0, float& c1) +{ + asm volatile("\n \ + v_fmac_f32 %0, %2, %3 \n \ + v_fmac_f32 %1, %2, %4 \n \ + " + : "=v"(c0), "=v"(c1) + : "v"(a), "v"(b0), "v"(b1), "0"(c0), "1"(c1)); +} + +// c0 += inner_product(a, b0) +// c1 += inner_product(a, b1) +// c2 += inner_product(a, b2) +// c3 += inner_product(a, b3) +__device__ void amd_assembly_outer_product_1x4( + float a, float b0, float b1, float b2, float b3, float& c0, float& c1, float& c2, float& c3) +{ + asm volatile("\n \ + v_fmac_f32 %0, %4, %5 \n \ + v_fmac_f32 %1, %4, %6 \n \ + v_fmac_f32 %2, %4, %7 \n \ + v_fmac_f32 %3, %4, %8 \n \ + " + : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3) + : "v"(a), "v"(b0), "v"(b1), "v"(b2), "v"(b3), "0"(c0), "1"(c1), "2"(c2), "3"(c3)); +} + +// c0 += inner_product(a, b0) +// c1 += inner_product(a, b1) +__device__ void +amd_assembly_outer_product_1x2(half2_t a, half2_t b0, half2_t b1, float& c0, float& c1) +{ + asm volatile("\n \ + v_dot2_f32_f16 %0, %2, %3, %0\n \ + v_dot2_f32_f16 %1, %2, %4, %1\n \ + " + : "=v"(c0), "=v"(c1) + : "v"(a), "v"(b0), "v"(b1), "0"(c0), "1"(c1)); +} + +// c0 += inner_product(a, b0) +// c1 += inner_product(a, b1) +__device__ void +amd_assembly_outer_product_1x2(half4_t a, half4_t b0, half4_t b1, float& c0, float& c1) +{ + // TODO remove pointer casting + const half2_t* p_a_half2 = reinterpret_cast(&a); + const half2_t* p_b0_half2 = reinterpret_cast(&b0); + const half2_t* p_b1_half2 = reinterpret_cast(&b1); + + // do dot2 two times + asm volatile("\n \ + v_dot2_f32_f16 %0, %2, %4, %0\n \ + v_dot2_f32_f16 %1, %2, %6, %1\n \ + v_dot2_f32_f16 %0, %3, %5, %0\n \ + v_dot2_f32_f16 %1, %3, %7, %1\n \ + " + : "=v"(c0), "=v"(c1) + : "v"(p_a_half2[0]), + "v"(p_a_half2[1]), + "v"(p_b0_half2[0]), + "v"(p_b0_half2[1]), + "v"(p_b1_half2[0]), + "v"(p_b1_half2[1]), + "0"(c0), + "1"(c1)); +} + +// c0 += inner_product(a, b0) +// c1 += inner_product(a, b1) +// 
c2 += inner_product(a, b2) +// c3 += inner_product(a, b3) +__device__ void amd_assembly_outer_product_1x4(half2_t a, + half2_t b0, + half2_t b1, + half2_t b2, + half2_t b3, + float& c0, + float& c1, + float& c2, + float& c3) +{ + asm volatile("\n \ + v_dot2_f32_f16 %0, %4, %5, %0\n \ + v_dot2_f32_f16 %1, %4, %6, %1\n \ + v_dot2_f32_f16 %2, %4, %7, %2\n \ + v_dot2_f32_f16 %3, %4, %8, %3\n \ + " + : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3) + : "v"(a), "v"(b0), "v"(b1), "v"(b2), "v"(b3), "0"(c0), "1"(c1), "2"(c2), "3"(c3)); +} + +// c0 += inner_product(a, b0) +// c1 += inner_product(a, b1) +// c2 += inner_product(a, b2) +// c3 += inner_product(a, b3) +__device__ void amd_assembly_outer_product_1x4(half4_t a, + half4_t b0, + half4_t b1, + half4_t b2, + half4_t b3, + float& c0, + float& c1, + float& c2, + float& c3) +{ + // TODO remove pointer casting + const half2_t* p_a_half2 = reinterpret_cast(&a); + const half2_t* p_b0_half2 = reinterpret_cast(&b0); + const half2_t* p_b1_half2 = reinterpret_cast(&b1); + const half2_t* p_b2_half2 = reinterpret_cast(&b2); + const half2_t* p_b3_half2 = reinterpret_cast(&b3); + + // do dot2 two times + asm volatile("\n \ + v_dot2_f32_f16 %0, %4, %6, %0\n \ + v_dot2_f32_f16 %1, %4, %8, %1\n \ + v_dot2_f32_f16 %2, %4, %10, %2\n \ + v_dot2_f32_f16 %3, %4, %12, %3\n \ + v_dot2_f32_f16 %0, %5, %7, %0\n \ + v_dot2_f32_f16 %1, %5, %9, %1\n \ + v_dot2_f32_f16 %2, %5, %11, %2\n \ + v_dot2_f32_f16 %3, %5, %13, %3\n \ + " + : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3) + : "v"(p_a_half2[0]), + "v"(p_a_half2[1]), + "v"(p_b0_half2[0]), + "v"(p_b0_half2[1]), + "v"(p_b1_half2[0]), + "v"(p_b1_half2[1]), + "v"(p_b2_half2[0]), + "v"(p_b2_half2[1]), + "v"(p_b3_half2[0]), + "v"(p_b3_half2[1]), + "0"(c0), + "1"(c1), + "2"(c2), + "3"(c3)); +} + +__device__ void amd_assembly_outer_product_1x4(half8_t a, + half8_t b0, + half8_t b1, + half8_t b2, + half8_t b3, + float& c0, + float& c1, + float& c2, + float& c3) +{ + + // TODO remove pointer casting + const half4_t* p_a_half4 = reinterpret_cast(&a); + const half4_t* p_b0_half4 = reinterpret_cast(&b0); + const half4_t* p_b1_half4 = reinterpret_cast(&b1); + const half4_t* p_b2_half4 = reinterpret_cast(&b2); + const half4_t* p_b3_half4 = reinterpret_cast(&b3); + + amd_assembly_outer_product_1x4( + p_a_half4[0], p_b0_half4[0], p_b1_half4[0], p_b2_half4[0], p_b3_half4[0], c0, c1, c2, c3); + + amd_assembly_outer_product_1x4( + p_a_half4[1], p_b0_half4[1], p_b1_half4[1], p_b2_half4[1], p_b3_half4[1], c0, c1, c2, c3); +} + +__device__ void amd_assembly_outer_product_1x4(half16_t a, + half16_t b0, + half16_t b1, + half16_t b2, + half16_t b3, + float& c0, + float& c1, + float& c2, + float& c3) +{ + // TODO remove pointer casting + const half8_t* p_a_half8 = reinterpret_cast(&a); + const half8_t* p_b0_half8 = reinterpret_cast(&b0); + const half8_t* p_b1_half8 = reinterpret_cast(&b1); + const half8_t* p_b2_half8 = reinterpret_cast(&b2); + const half8_t* p_b3_half8 = reinterpret_cast(&b3); + + amd_assembly_outer_product_1x4( + p_a_half8[0], p_b0_half8[0], p_b1_half8[0], p_b2_half8[0], p_b3_half8[0], c0, c1, c2, c3); + + amd_assembly_outer_product_1x4( + p_a_half8[1], p_b0_half8[1], p_b1_half8[1], p_b2_half8[1], p_b3_half8[1], c0, c1, c2, c3); +} + +// c0 += inner_product(a, b0) +// c1 += inner_product(a, b1) +__device__ void +amd_assembly_outer_product_1x2(int8x4_t a, int8x4_t b0, int8x4_t b1, int32_t& c0, int32_t& c1) +{ +#if 1 + asm volatile("\n \ + v_dot4_i32_i8 %0, %2, %3, %0\n \ + v_dot4_i32_i8 %1, %2, %4, %1\n \ + " + : "=v"(c0), "=v"(c1) + : 
"v"(as_type(a)), + "v"(as_type(b0)), + "v"(as_type(b1)), + "0"(c0), + "1"(c1)); +#else + c0 = __builtin_amdgcn_sdot4(as_type(a), as_type(b0), c0, false); + c1 = __builtin_amdgcn_sdot4(as_type(a), as_type(b1), c1, false); +#endif +} + +// c0 += inner_product(a, b0) +// c1 += inner_product(a, b1) +// c2 += inner_product(a, b2) +// c3 += inner_product(a, b3) +__device__ void amd_assembly_outer_product_1x4(int8x4_t a, + int8x4_t b0, + int8x4_t b1, + int8x4_t b2, + int8x4_t b3, + int32_t& c0, + int32_t& c1, + int32_t& c2, + int32_t& c3) +{ +#if 1 + asm volatile("\n \ + v_dot4_i32_i8 %0, %4, %5, %0\n \ + v_dot4_i32_i8 %1, %4, %6, %1\n \ + v_dot4_i32_i8 %2, %4, %7, %2\n \ + v_dot4_i32_i8 %3, %4, %8, %3\n \ + " + : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3) + : "v"(as_type(a)), + "v"(as_type(b0)), + "v"(as_type(b1)), + "v"(as_type(b2)), + "v"(as_type(b3)), + "0"(c0), + "1"(c1), + "2"(c2), + "3"(c3)); +#else + c0 = __builtin_amdgcn_sdot4(as_type(a), as_type(b0), c0, false); + c1 = __builtin_amdgcn_sdot4(as_type(a), as_type(b1), c1, false); + c2 = __builtin_amdgcn_sdot4(as_type(a), as_type(b2), c2, false); + c3 = __builtin_amdgcn_sdot4(as_type(a), as_type(b3), c3, false); +#endif +} + +__device__ void amd_assembly_outer_product_1x4(int8x8_t a, + int8x8_t b0, + int8x8_t b1, + int8x8_t b2, + int8x8_t b3, + int32_t& c0, + int32_t& c1, + int32_t& c2, + int32_t& c3) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + amd_assembly_outer_product_1x4(vector_type{a}.AsType()[I0], + vector_type{b0}.AsType()[I0], + vector_type{b1}.AsType()[I0], + vector_type{b2}.AsType()[I0], + vector_type{b3}.AsType()[I0], + c0, + c1, + c2, + c3); + + amd_assembly_outer_product_1x4(vector_type{a}.AsType()[I1], + vector_type{b0}.AsType()[I1], + vector_type{b1}.AsType()[I1], + vector_type{b2}.AsType()[I1], + vector_type{b3}.AsType()[I1], + c0, + c1, + c2, + c3); +} + +__device__ void amd_assembly_outer_product_1x4(int8x16_t a, + int8x16_t b0, + int8x16_t b1, + int8x16_t b2, + int8x16_t b3, + int32_t& c0, + int32_t& c1, + int32_t& c2, + int32_t& c3) + +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + amd_assembly_outer_product_1x4(vector_type{a}.AsType()[I0], + vector_type{b0}.AsType()[I0], + vector_type{b1}.AsType()[I0], + vector_type{b2}.AsType()[I0], + vector_type{b3}.AsType()[I0], + c0, + c1, + c2, + c3); + + amd_assembly_outer_product_1x4(vector_type{a}.AsType()[I1], + vector_type{b0}.AsType()[I1], + vector_type{b1}.AsType()[I1], + vector_type{b2}.AsType()[I1], + vector_type{b3}.AsType()[I1], + c0, + c1, + c2, + c3); + + amd_assembly_outer_product_1x4(vector_type{a}.AsType()[I2], + vector_type{b0}.AsType()[I2], + vector_type{b1}.AsType()[I2], + vector_type{b2}.AsType()[I2], + vector_type{b3}.AsType()[I2], + c0, + c1, + c2, + c3); + + amd_assembly_outer_product_1x4(vector_type{a}.AsType()[I3], + vector_type{b0}.AsType()[I3], + vector_type{b1}.AsType()[I3], + vector_type{b2}.AsType()[I3], + vector_type{b3}.AsType()[I3], + c0, + c1, + c2, + c3); +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/amd_llvm_intrinsic.hpp b/composable_kernel/include/utility/amd_llvm_intrinsic.hpp new file mode 100644 index 0000000000..841d48f81c --- /dev/null +++ b/composable_kernel/include/utility/amd_llvm_intrinsic.hpp @@ -0,0 +1,11 @@ +#ifndef CK_AMD_LLVM_INTRINSIC_HPP +#define CK_AMD_LLVM_INTRINSIC_HPP + +#include "data_type.hpp" + +namespace ck { + +__device__ int32_t 
llvm_amdgcn_readfirstlane_i32(int32_t i) __asm("llvm.amdgcn.readfirstlane"); + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/amd_xdlops.hpp b/composable_kernel/include/utility/amd_xdlops.hpp new file mode 100644 index 0000000000..da74fe1d48 --- /dev/null +++ b/composable_kernel/include/utility/amd_xdlops.hpp @@ -0,0 +1,499 @@ +#ifndef CK_AMD_XDLOPS_HPP +#define CK_AMD_XDLOPS_HPP + +#include "data_type.hpp" + +namespace ck { + +// A, B, C, cbsz, abid, blgp +extern "C" __device__ float32_t llvm_intrin_amdgcn_mfma_f32_32x32x1f32( + float, float, float32_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.32x32x1f32"); + +extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_32x32x2f32( + float, float, float16_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.32x32x2f32"); + +extern "C" __device__ float4_t llvm_intrin_amdgcn_mfma_f32_16x16x4f32( + float, float, float4_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.16x16x4f32"); + +extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_16x16x1f32( + float, float, float16_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.16x16x1f32"); + +extern "C" __device__ float4_t llvm_intrin_amdgcn_mfma_f32_4x4x1f32( + float, float, float4_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.4x4x1f32"); + +extern "C" __device__ float32_t llvm_intrin_amdgcn_mfma_f32_32x32x4f16( + half4_t, half4_t, float32_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.32x32x4f16"); + +extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_32x32x8f16( + half4_t, half4_t, float16_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.32x32x8f16"); + +extern "C" __device__ float4_t llvm_intrin_amdgcn_mfma_f32_16x16x16f16( + half4_t, half4_t, float4_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.16x16x16f16"); + +extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_16x16x4f16( + half4_t, half4_t, float16_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.16x16x4f16"); + +extern "C" __device__ float4_t llvm_intrin_amdgcn_mfma_f32_4x4x4f16( + half4_t, half4_t, float4_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.4x4x4f16"); + +extern "C" __device__ float32_t llvm_intrin_amdgcn_mfma_f32_32x32x2bf16( + ushort2_t, ushort2_t, float32_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.32x32x2bf16"); + +extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_32x32x4bf16( + ushort2_t, ushort2_t, float16_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.32x32x4bf16"); + +extern "C" __device__ float4_t llvm_intrin_amdgcn_mfma_f32_16x16x8bf16( + ushort2_t, ushort2_t, float4_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.16x16x8bf16"); + +extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_16x16x2bf16( + ushort2_t, ushort2_t, float16_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.16x16x2bf16"); + +extern "C" __device__ float4_t llvm_intrin_amdgcn_mfma_f32_4x4x2bf16( + ushort2_t, ushort2_t, float4_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.4x4x2bf16"); + +template +struct intrin_mfma_f32_32x32x1f32; + +template +struct intrin_mfma_f32_32x32x1f32<64, 64, COffset> +{ + template + __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c) + { + reg_c(Number{}).template AsType()(Number<0>{}) = + llvm_intrin_amdgcn_mfma_f32_32x32x1f32( + reg_a, + reg_b, + reg_c[Number{}].template AsType()[Number<0>{}], + 1, + 0, + 0); + reg_c(Number{}).template AsType()(Number<0>{}) = + llvm_intrin_amdgcn_mfma_f32_32x32x1f32( + reg_a, + reg_b, + reg_c[Number{}].template AsType()[Number<0>{}], + 1, + 1, + 0); + } +}; + +template +struct intrin_mfma_f32_32x32x1f32<32, 64, 
COffset> +{ + template + __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c) + { + reg_c(Number{}).template AsType()(Number<0>{}) = + llvm_intrin_amdgcn_mfma_f32_32x32x1f32( + reg_a, + reg_b, + reg_c[Number{}].template AsType()[Number<0>{}], + 1, + 0, + 0); + } +}; + +template +struct intrin_mfma_f32_32x32x2f32; + +template +struct intrin_mfma_f32_32x32x2f32<32, 32, COffset> +{ + template + __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c) + { + reg_c(Number{}).template AsType()(Number<0>{}) = + llvm_intrin_amdgcn_mfma_f32_32x32x2f32( + reg_a, + reg_b, + reg_c[Number{}].template AsType()[Number<0>{}], + 0, + 0, + 0); + } +}; + +template +struct intrin_mfma_f32_16x16x4f32; + +template +struct intrin_mfma_f32_16x16x4f32<16, 16, COffset> +{ + template + __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c) + { + reg_c(Number{}).template AsType()(Number<0>{}) = + llvm_intrin_amdgcn_mfma_f32_16x16x4f32( + reg_a, + reg_b, + reg_c[Number{}].template AsType()[Number<0>{}], + 0, + 0, + 0); + } +}; + +template +struct intrin_mfma_f32_16x16x1f32; + +template +struct intrin_mfma_f32_16x16x1f32<16, 64, COffset> +{ + template + __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c) + { + + reg_c(Number{}).template AsType()(Number<0>{}) = + llvm_intrin_amdgcn_mfma_f32_16x16x1f32( + reg_a, + reg_b, + reg_c[Number{}].template AsType()[Number<0>{}], + 2, + 0, + 0); + } +}; + +template +struct intrin_mfma_f32_4x4x1f32; + +template +struct intrin_mfma_f32_4x4x1f32<4, 64, COffset> +{ + template + __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c) + { + reg_c(Number{}).template AsType()(Number<0>{}) = + llvm_intrin_amdgcn_mfma_f32_4x4x1f32( + reg_a, + reg_b, + reg_c[Number{}].template AsType()[Number<0>{}], + 4, + 0, + 0); + } +}; + +template +struct intrin_mfma_f32_4x4x1f32<8, 64, COffset> +{ + template + __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c) + { + reg_c(Number{}).template AsType()(Number<0>{}) = + llvm_intrin_amdgcn_mfma_f32_4x4x1f32( + reg_a, + reg_b, + reg_c[Number{}].template AsType()[Number<0>{}], + 4, + 0, + 0); + reg_c(Number{}).template AsType()(Number<0>{}) = + llvm_intrin_amdgcn_mfma_f32_4x4x1f32( + reg_a, + reg_b, + reg_c[Number{}].template AsType()[Number<0>{}], + 4, + 1, + 0); + } +}; + +template +struct intrin_mfma_f32_32x32x4f16; + +template +struct intrin_mfma_f32_32x32x4f16<64, 64, COffset> +{ + template + __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c) + { + reg_c(Number{}).template AsType()(Number<0>{}) = + llvm_intrin_amdgcn_mfma_f32_32x32x4f16( + reg_a, + reg_b, + reg_c[Number{}].template AsType()[Number<0>{}], + 1, + 0, + 0); + reg_c(Number{}).template AsType()(Number<0>{}) = + llvm_intrin_amdgcn_mfma_f32_32x32x4f16( + reg_a, + reg_b, + reg_c[Number{}].template AsType()[Number<0>{}], + 1, + 1, + 0); + } +}; + +template +struct intrin_mfma_f32_32x32x4f16<32, 64, COffset> +{ + template + __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c) + { + reg_c(Number{}).template AsType()(Number<0>{}) = + llvm_intrin_amdgcn_mfma_f32_32x32x4f16( + reg_a, + reg_b, + reg_c[Number{}].template AsType()[Number<0>{}], + 1, + 0, + 0); + } +}; + +template +struct intrin_mfma_f32_32x32x8f16; + +template +struct intrin_mfma_f32_32x32x8f16<32, 32, COffset> +{ + template + __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& 
reg_c) + { + reg_c(Number{}).template AsType()(Number<0>{}) = + llvm_intrin_amdgcn_mfma_f32_32x32x8f16( + reg_a, + reg_b, + reg_c[Number{}].template AsType()[Number<0>{}], + 0, + 0, + 0); + } +}; + +template +struct intrin_mfma_f32_16x16x16f16; + +template +struct intrin_mfma_f32_16x16x16f16<16, 16, COffset> +{ + template + __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c) + { + reg_c(Number{}).template AsType()(Number<0>{}) = + llvm_intrin_amdgcn_mfma_f32_16x16x16f16( + reg_a, + reg_b, + reg_c[Number{}].template AsType()[Number<0>{}], + 0, + 0, + 0); + } +}; + +template +struct intrin_mfma_f32_16x16x4f16; + +template +struct intrin_mfma_f32_16x16x4f16<16, 64, COffset> +{ + template + __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c) + { + reg_c(Number{}).template AsType()(Number<0>{}) = + llvm_intrin_amdgcn_mfma_f32_16x16x4f16( + reg_a, + reg_b, + reg_c[Number{}].template AsType()[Number<0>{}], + 2, + 0, + 0); + } +}; + +template +struct intrin_mfma_f32_4x4x4f16; + +template +struct intrin_mfma_f32_4x4x4f16<4, 64, COffset> +{ + template + __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c) + { + reg_c(Number{}).template AsType()(Number<0>{}) = + llvm_intrin_amdgcn_mfma_f32_4x4x4f16( + reg_a, + reg_b, + reg_c[Number{}].template AsType()[Number<0>{}], + 4, + 0, + 0); + } +}; + +template +struct intrin_mfma_f32_4x4x4f16<8, 64, COffset> +{ + template + __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c) + { + reg_c(Number{}).template AsType()(Number<0>{}) = + llvm_intrin_amdgcn_mfma_f32_4x4x4f16( + reg_a, + reg_b, + reg_c[Number{}].template AsType()[Number<0>{}], + 4, + 0, + 0); + reg_c(Number{}).template AsType()(Number<0>{}) = + llvm_intrin_amdgcn_mfma_f32_4x4x4f16( + reg_a, + reg_b, + reg_c[Number{}].template AsType()[Number<0>{}], + 4, + 1, + 0); + } +}; + +#if 0 +template +struct intrin_mfma_f32_32x32x2bf16; + +template +struct intrin_mfma_f32_32x32x2bf16<128, 64, AStride, BStride> +{ + __device__ static c_vec32_4_t::VecType + run(const ushort2_t* reg_a, const ushort2_t* reg_b, c_vec32_4_t::VecType reg_c) + { + reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 1, 0, 0); + reg_c.s.y = llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[0], reg_c.s.y, 1, 1, 0); + + reg_c.s.z = + llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[AStride], reg_b[0], reg_c.s.z, 1, 0, 0); + reg_c.s.w = + llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[AStride], reg_b[0], reg_c.s.w, 1, 1, 0); + + return reg_c; + } +}; + +template +struct intrin_mfma_f32_32x32x2bf16<64, 128, AStride, BStride> +{ + __device__ static c_vec32_4_t::VecType + run(const ushort2_t* reg_a, const ushort2_t* reg_b, c_vec32_4_t::VecType reg_c) + { + reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 1, 0, 0); + reg_c.s.y = llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[0], reg_c.s.y, 1, 1, 0); + + reg_c.s.z = + llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[BStride], reg_c.s.z, 1, 0, 0); + reg_c.s.w = + llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[BStride], reg_c.s.w, 1, 1, 0); + + return reg_c; + } +}; + +template +struct intrin_mfma_f32_32x32x2bf16<64, 64, AStride, BStride> +{ + __device__ static c_vec32_2_t::VecType + run(const ushort2_t* reg_a, const ushort2_t* reg_b, c_vec32_2_t::VecType reg_c) + { + reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 1, 0, 0); + reg_c.s.y = 
llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[0], reg_c.s.y, 1, 1, 0); + + return reg_c; + } +}; + +template +struct intrin_mfma_f32_32x32x2bf16<64, 32, AStride, BStride> +{ + __device__ static c_vec32_1_t::VecType + run(const ushort2_t* reg_a, const ushort2_t* reg_b, c_vec32_1_t::VecType reg_c) + { + reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 0, 0, 1); + + return reg_c; + } +}; + +template +struct intrin_mfma_f32_32x32x2bf16<32, 64, AStride, BStride> +{ + __device__ static c_vec32_1_t::VecType + run(const ushort2_t* reg_a, const ushort2_t* reg_b, c_vec32_1_t::VecType reg_c) + { + reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 1, 0, 0); + return reg_c; + } +}; + +__device__ c_vec16_1_t::VecType intrin_mfma_f32_32x32x4bf16(const ushort2_t* reg_a, + const ushort2_t* reg_b, + c_vec16_1_t::VecType reg_c) +{ + reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_32x32x4bf16(reg_a[0], reg_b[0], reg_c.s.x, 0, 0, 0); + return reg_c; +} + +__device__ c_vec4_1_t::VecType intrin_mfma_f32_16x16x8bf16(const ushort2_t* reg_a, + const ushort2_t* reg_b, + c_vec4_1_t::VecType reg_c) +{ + reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_16x16x8bf16(reg_a[0], reg_b[0], reg_c.s.x, 0, 0, 0); + return reg_c; +} + +template +__device__ c_vec16_1_t::VecType intrin_mfma_f32_16x16x2bf16(const ushort2_t* reg_a, + const ushort2_t* reg_b, + c_vec16_1_t::VecType reg_c); + +template <> +__device__ c_vec16_1_t::VecType intrin_mfma_f32_16x16x2bf16<16, 64>(const ushort2_t* reg_a, + const ushort2_t* reg_b, + c_vec16_1_t::VecType reg_c) +{ + reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_16x16x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 2, 0, 0); + return reg_c; +} + +template <> +__device__ c_vec16_1_t::VecType intrin_mfma_f32_16x16x2bf16<64, 16>(const ushort2_t* reg_a, + const ushort2_t* reg_b, + c_vec16_1_t::VecType reg_c) +{ + reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_16x16x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 0, 0, 4); + return reg_c; +} + +template +struct intrin_mfma_f32_4x4x2bf16; + +template <> +struct intrin_mfma_f32_4x4x2bf16<4, 64> +{ + __device__ static c_vec4_1_t::VecType + run(const ushort2_t* reg_a, const ushort2_t* reg_b, c_vec4_1_t::VecType reg_c) + { + reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_4x4x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 4, 0, 0); + return reg_c; + } +}; + +template <> +struct intrin_mfma_f32_4x4x2bf16<8, 64> +{ + __device__ static c_vec4_2_t::VecType + run(const ushort2_t* reg_a, const ushort2_t* reg_b, c_vec4_2_t::VecType reg_c) + { + reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_4x4x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 4, 0, 0); + reg_c.s.y = llvm_intrin_amdgcn_mfma_f32_4x4x2bf16(reg_a[0], reg_b[0], reg_c.s.y, 4, 1, 0); + return reg_c; + } +}; + +#endif + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/array.hpp b/composable_kernel/include/utility/array.hpp new file mode 100644 index 0000000000..7271094d39 --- /dev/null +++ b/composable_kernel/include/utility/array.hpp @@ -0,0 +1,63 @@ +#ifndef CK_ARRAY_HPP +#define CK_ARRAY_HPP + +#include "functional2.hpp" +#include "sequence.hpp" + +namespace ck { + +template +struct Array +{ + using type = Array; + using data_type = TData; + + TData mData[NSize]; + + __host__ __device__ static constexpr index_t Size() { return NSize; } + + __host__ __device__ constexpr const TData& At(index_t i) const { return mData[i]; } + + __host__ __device__ constexpr TData& At(index_t i) { return mData[i]; } + + __host__ __device__ constexpr const TData& operator[](index_t i) const { return At(i); } 
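+    // Accessor convention (a descriptive note, inferred from the surrounding code): At(i)
+    // performs the underlying indexing, operator[] is the const (read) accessor, and
+    // operator() is the mutable (write) accessor; ContainerElementPicker in
+    // container_element_picker.hpp follows the same pattern.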
+ + __host__ __device__ constexpr TData& operator()(index_t i) { return At(i); } + + template + __host__ __device__ constexpr auto operator=(const T& a) + { + static_assert(T::Size() == Size(), "wrong! size not the same"); + + static_for<0, Size(), 1>{}([&](auto i) { operator()(i) = a[i]; }); + + return *this; + } +}; + +// empty Array +template +struct Array +{ + using type = Array; + using data_type = TData; + + __host__ __device__ static constexpr index_t Size() { return 0; } +}; + +template +__host__ __device__ constexpr auto make_array(X&& x, Xs&&... xs) +{ + using data_type = remove_cv_t>; + return Array{{std::forward(x), std::forward(xs)...}}; +} + +// make empty array +template +__host__ __device__ constexpr auto make_array() +{ + return Array{}; +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/array_multi_index.hpp b/composable_kernel/include/utility/array_multi_index.hpp new file mode 100644 index 0000000000..f692fb5143 --- /dev/null +++ b/composable_kernel/include/utility/array_multi_index.hpp @@ -0,0 +1,77 @@ +#ifndef CK_ARRAY_MULTI_INDEX_HPP +#define CK_ARRAY_MULTI_INDEX_HPP + +#include "common_header.hpp" + +namespace ck { + +template +using MultiIndex = Array; + +template +__host__ __device__ constexpr auto make_multi_index(Xs&&... xs) +{ + return make_array(index_t{xs}...); +} + +template +__host__ __device__ constexpr auto make_zero_multi_index() +{ + return unpack([](auto... xs) { return make_multi_index(xs...); }, + typename uniform_sequence_gen::type{}); +} + +template +__host__ __device__ constexpr auto to_multi_index(const T& x) +{ + return unpack([](auto... ys) { return make_multi_index(ys...); }, x); +} + +template +__host__ __device__ constexpr auto operator+=(MultiIndex& y, const X& x) +{ + static_assert(X::Size() == NSize, "wrong! size not the same"); + static_for<0, NSize, 1>{}([&](auto i) { y(i) += x[i]; }); + return y; +} + +template +__host__ __device__ constexpr auto operator-=(MultiIndex& y, const X& x) +{ + static_assert(X::Size() == NSize, "wrong! size not the same"); + static_for<0, NSize, 1>{}([&](auto i) { y(i) -= x[i]; }); + return y; +} + +template +__host__ __device__ constexpr auto operator+(const MultiIndex& a, const T& b) +{ + using type = MultiIndex; + static_assert(T::Size() == NSize, "wrong! size not the same"); + type r; + static_for<0, NSize, 1>{}([&](auto i) { r(i) = a[i] + b[i]; }); + return r; +} + +template +__host__ __device__ constexpr auto operator-(const MultiIndex& a, const T& b) +{ + using type = MultiIndex; + static_assert(T::Size() == NSize, "wrong! size not the same"); + type r; + static_for<0, NSize, 1>{}([&](auto i) { r(i) = a[i] - b[i]; }); + return r; +} + +template +__host__ __device__ constexpr auto operator*(const MultiIndex& a, const T& b) +{ + using type = MultiIndex; + static_assert(T::Size() == NSize, "wrong! 
size not the same"); + type r; + static_for<0, NSize, 1>{}([&](auto i) { r(i) = a[i] * b[i]; }); + return r; +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp new file mode 100644 index 0000000000..5ff7688a1c --- /dev/null +++ b/composable_kernel/include/utility/common_header.hpp @@ -0,0 +1,45 @@ +#ifndef CK_COMMON_HEADER_HPP +#define CK_COMMON_HEADER_HPP + +#include "config.hpp" +#include "array.hpp" +#include "container_helper.hpp" +#include "statically_indexed_array.hpp" +#include "container_element_picker.hpp" +#include "multi_index.hpp" +#include "data_type_enum.hpp" +#include "data_type.hpp" +#include "data_type_helper.hpp" +#include "functional.hpp" +#include "functional2.hpp" +#include "functional3.hpp" +#include "functional4.hpp" +#include "integral_constant.hpp" +#include "math.hpp" +#include "number.hpp" +#include "sequence.hpp" +#include "sequence_helper.hpp" +#include "synchronization.hpp" +#include "tuple.hpp" +#include "tuple_helper.hpp" +#include "type.hpp" +#include "utility.hpp" +#include "magic_division.hpp" +#include "amd_buffer_addressing_v2.hpp" +#include "static_buffer.hpp" +#include "dynamic_buffer.hpp" + +// TODO: remove this +#if CK_USE_AMD_INLINE_ASM +#include "amd_inline_asm.hpp" +#endif + +#if CK_USE_AMD_DLOP +#include "amd_dlop.hpp" +#endif + +#if CK_USE_AMD_XDLOPS +#include "amd_xdlops.hpp" +#endif + +#endif diff --git a/composable_kernel/include/utility/config.hpp b/composable_kernel/include/utility/config.hpp new file mode 100644 index 0000000000..4908d8d818 --- /dev/null +++ b/composable_kernel/include/utility/config.hpp @@ -0,0 +1,142 @@ +#ifndef CK_CONFIG_AMD_HPP +#define CK_CONFIG_AMD_HPP + +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include "hip/hip_runtime.h" +#include "hip/hip_fp16.h" +#endif +#include "bfloat16_dev.hpp" + +// address space for kernel parameter +#define CONSTANT __attribute__((address_space(4))) + +// GPU target +// should enable one and only one GPU target +#if !(defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) || defined(CK_AMD_GPU_GFX906) || \ + defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90A) || defined(CK_AMD_GPU_GFX1030)) +#error Need to define a single GPU target +#endif + +// HIP version +#ifndef CK_HIP_VERSION_FLAT +#define CK_HIP_VERSION_FLAT 0 +#endif + +// launch bounds +#define CK_USE_LAUNCH_BOUNDS 1 + +#ifdef CK_USE_LAUNCH_BOUNDS +#define CK_MAX_THREAD_PER_BLOCK 256 +#define CK_MIN_BLOCK_PER_CU 2 +#endif + +// buffer resourse +#if defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) || defined(CK_AMD_GPU_GFX906) || \ + defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90A) +#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000 +#elif defined(CK_AMD_GPU_GFX1030) +#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000 +#endif + +// multi index +#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0 + +// AMD inline asm +#ifndef CK_USE_AMD_INLINE_ASM +#define CK_USE_AMD_INLINE_ASM 1 +#endif + +// AMD DLOPS +#ifndef CK_USE_AMD_DLOP +#define CK_USE_AMD_DLOP 1 +#endif + +#ifndef CK_USE_AMD_DLOP_INLINE_ASM +#define CK_USE_AMD_DLOP_INLINE_ASM 1 +#endif + +// AMD buffer addressing +#ifndef CK_USE_AMD_BUFFER_ADDRESSING +#define CK_USE_AMD_BUFFER_ADDRESSING 1 +#endif + +// only gfx908 support native floating point atomic add +#ifndef CK_USE_AMD_BUFFER_ATOMIC_FADD +#define CK_USE_AMD_BUFFER_ATOMIC_FADD 0 +#endif + +// AMD XDLOPS +#ifndef CK_USE_AMD_XDLOPS +#define CK_USE_AMD_XDLOPS 0 +#endif + +// block synchronization only 
s_wait lgkmcnt(0), not vmcnt(0) +#ifndef CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM +#define CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1 +#endif + +// experimental implementation +#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0 +#endif + +#ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK +#define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1 +#endif + +#ifndef CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK +#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK 1 +#endif + +// pass tensor descriptor by value or void* +#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 0 +#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 1 + +// merge transformation use magic number division +#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 0 + +// hack: have underlying assumption that need to be satsified, otherwise it's a bug +// hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be +// thread-invariant, otherwise it's a bug +// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread" +#ifndef CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE +#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0 +#endif + +// workaround for compiler crash when compiling recursive lambda +#ifndef CK_WORKAROUND_SWDEV_275126 +#define CK_WORKAROUND_SWDEV_275126 1 +#endif + +// workaround for compiler crash when using buffer load/store for i8 +#ifndef CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE +#define CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE 1 +#endif + +// workaround for compiler crash when using buffer load/store for i8 +#ifndef CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE +#define CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE 1 +#endif + +namespace ck { + +enum AddressSpaceEnum_t +{ + Generic, + Global, + Lds, + Sgpr, + Vgpr +}; + +enum InMemoryDataOperationEnum_t +{ + Set, + AtomicAdd +}; + +// index type +using index_t = int32_t; + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/container_element_picker.hpp b/composable_kernel/include/utility/container_element_picker.hpp new file mode 100644 index 0000000000..54915125ac --- /dev/null +++ b/composable_kernel/include/utility/container_element_picker.hpp @@ -0,0 +1,155 @@ +#ifndef CK_CONTAINER_ELEMENT_PICKER_HPP +#define CK_CONTAINER_ELEMENT_PICKER_HPP + +#include "functional2.hpp" +#include "sequence.hpp" + +namespace ck { + +// Arr: Array or StaticallyIndexedArray +// Picks: Sequence<...> +template +struct ContainerElementPicker +{ + using type = ContainerElementPicker; +#if 0 + using data_type = typename Arr::data_type; +#endif + + __host__ __device__ constexpr ContainerElementPicker() = delete; + + __host__ __device__ constexpr ContainerElementPicker(Arr& array) : mArray{array} + { + constexpr index_t imax = + reduce_on_sequence(Picks{}, math::maximize{}, Number<0>{}); + + static_assert(imax < Arr::Size(), "wrong! 
exceeding # array element"); + } + + __host__ __device__ static constexpr auto Size() { return Picks::Size(); } + + template + __host__ __device__ constexpr const auto& At(Number i) const + { + static_assert(I < Size(), "wrong!"); + + constexpr auto IP = Picks{}[i]; + return mArray[IP]; + } + + template + __host__ __device__ constexpr auto& At(Number i) + { + static_assert(I < Size(), "wrong!"); + + constexpr auto IP = Picks{}[i]; + return mArray(IP); + } + + template + __host__ __device__ constexpr const auto& operator[](Number i) const + { + return At(i); + } + + template + __host__ __device__ constexpr auto& operator()(Number i) + { + return At(i); + } + + template + __host__ __device__ constexpr auto operator=(const T& a) + { + static_assert(T::Size() == Size(), "wrong! size not the same"); + + static_for<0, Size(), 1>{}([&](auto i) { operator()(i) = a[i]; }); + + return *this; + } + + private: + Arr& mArray; +}; + +// Arr: Array or StaticallyIndexedArray +// Picks: Sequence<...> +template +struct ConstantContainerElementPicker +{ + using type = ConstantContainerElementPicker; +#if 0 + using data_type = typename Arr::data_type; +#endif + + __host__ __device__ constexpr ConstantContainerElementPicker() = delete; + + __host__ __device__ constexpr ConstantContainerElementPicker(const Arr& array) : mArray{array} + { + constexpr index_t imax = + reduce_on_sequence(Picks{}, math::maximize{}, Number<0>{}); + + static_assert(imax < Arr::Size(), "wrong! exceeding # array element"); + } + + __host__ __device__ static constexpr auto Size() { return Picks::Size(); } + + template + __host__ __device__ constexpr const auto& At(Number i) const + { + static_assert(I < Size(), "wrong!"); + + constexpr auto IP = Picks{}[i]; + return mArray[IP]; + } + + template + __host__ __device__ constexpr const auto& operator[](Number i) const + { + return At(i); + } + + private: + const Arr& mArray; +}; + +template +__host__ __device__ constexpr auto operator+=(ContainerElementPicker& y, const X& x) +{ + using Y = ContainerElementPicker; + constexpr index_t nsize = Y::Size(); + + static_assert(nsize == X::Size(), "wrong! size not the same"); + + static_for<0, nsize, 1>{}([&](auto i) { y(i) += x[i]; }); + + return y; +} + +template +__host__ __device__ constexpr auto operator-=(ContainerElementPicker& y, const X& x) +{ + using Y = ContainerElementPicker; + constexpr index_t nsize = Y::Size(); + + static_assert(nsize == X::Size(), "wrong! 
size not the same"); + + static_for<0, nsize, 1>{}([&](auto i) { y(i) -= x[i]; }); + + return y; +} + +template +__host__ __device__ constexpr auto pick_container_element(Arr& a, Picks) +{ + return ContainerElementPicker(a); +} + +template +__host__ __device__ constexpr auto pick_container_element(const Arr& a, Picks) +{ + return ConstantContainerElementPicker(a); +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/container_helper.hpp b/composable_kernel/include/utility/container_helper.hpp new file mode 100644 index 0000000000..a7ed8ec059 --- /dev/null +++ b/composable_kernel/include/utility/container_helper.hpp @@ -0,0 +1,403 @@ +#ifndef CK_CONTAINER_HELPER_HPP +#define CK_CONTAINER_HELPER_HPP + +#include "sequence.hpp" +#include "sequence_helper.hpp" +#include "array.hpp" +#include "tuple.hpp" +#include "tuple_helper.hpp" +#include "statically_indexed_array.hpp" +#include "container_element_picker.hpp" + +namespace ck { + +template +__host__ __device__ constexpr auto container_push_back(const Array& a, const TData& x) +{ + Array r; + + static_for<0, NSize, 1>{}([&r, &a ](auto i) constexpr { r(i) = a[i]; }); + + r(Number{}) = x; + + return r; +} + +template +__host__ __device__ constexpr auto container_push_front(const Tuple& a, const T& x) +{ + return container_concat(make_tuple(x), a); +} + +template +__host__ __device__ constexpr auto container_push_back(const Tuple& a, const T& x) +{ + return container_concat(a, make_tuple(x)); +} + +template +__host__ __device__ constexpr auto +container_reorder_given_new2old(const Array& old_array, Sequence /*new2old*/) +{ + static_assert(NSize == sizeof...(IRs), "wrong! size not consistent"); + + static_assert(is_valid_sequence_map>{}, "wrong! invalid reorder map"); + + return make_array(old_array[Number{}]...); +} + +template +__host__ __device__ constexpr auto +container_reorder_given_old2new(const Array& old_array, Sequence old2new) +{ + return container_reorder_given_new2old( + old_array, typename sequence_map_inverse::type{}); +} + +template +__host__ __device__ constexpr auto container_reorder_given_new2old(const Tuple& old_tuple, + Sequence /*new2old*/) +{ + static_assert(sizeof...(Ts) == sizeof...(IRs), "wrong! size not consistent"); + + static_assert(is_valid_sequence_map>{}, "wrong! invalid reorder map"); + + return make_tuple(old_tuple[Number{}]...); +} + +template +__host__ __device__ constexpr auto container_reorder_given_old2new(const Tuple& old_tuple, + Sequence old2new) +{ + return container_reorder_given_new2old( + old_tuple, typename sequence_map_inverse::type{}); +} + +template +__host__ __device__ constexpr auto container_reorder_given_new2old(Sequence /* old_seq */, + Sequence /*new2old*/) +{ + static_assert(sizeof...(Is) == sizeof...(IRs), "wrong! size not consistent"); + + static_assert(is_valid_sequence_map>{}, "wrong! invalid reorder map"); + + return Sequence::At(Number{})...>{}; +} + +template +__host__ __device__ constexpr auto container_reorder_given_old2new(Sequence old_seq, + Sequence /* old2new */) +{ + static_assert(sizeof...(Is) == sizeof...(IRs), "wrong! size not consistent"); + + static_assert(is_valid_sequence_map>{}, "wrong! 
invalid reorder map"); + + constexpr auto new2old = typename sequence_map_inverse>::type{}; + + return container_reorder_given_new2old(old_seq, new2old); +} + +#if !CK_WORKAROUND_SWDEV_275126 +// rocm-4.1 compiler would crash for recursive lambda +template +__host__ __device__ constexpr auto container_reduce(const Container& x, + Reduce reduce, + Init init, + Number = Number<0>{}, + Number = Number{}, + Number = Number<1>{}) +{ + static_assert((IEnd - IBegin) % IStep == 0, "wrong!"); + + // f is recursive function, fs is a dummy of f + // i is index, y_old is current scan, r_old is current reduction + auto f = [&](auto fs, auto i, auto r_old) { + auto r_new = reduce(x[i], r_old); + + if constexpr(i.value < IEnd - IStep) + { + // recursively call f/fs + return fs(fs, i + Number{}, r_new); + } + else + { + return r_new; + } + }; + + // start recursion + return f(f, Number{}, init); +} +#else +// i is index, y_old is current scan, r_old is current reduction +template +__host__ __device__ constexpr auto container_reduce_impl( + const Container& x, Reduce reduce, ROld r_old, Number i, Number, Number) +{ + auto r_new = reduce(x[i], r_old); + + if constexpr(i.value < IEnd - IStep) + { + return container_reduce_impl( + x, reduce, r_new, i + Number{}, Number{}, Number{}); + } + else + { + return r_new; + } +} + +// rocm-4.1 compiler would crash for recursive lambda +// container reduce with initial value +template +__host__ __device__ constexpr auto container_reduce(const Container& x, + Reduce reduce, + Init init, + Number = Number<0>{}, + Number = Number{}, + Number = Number<1>{}) +{ + static_assert((IEnd - IBegin) % IStep == 0, "wrong!"); + + if constexpr(IEnd > IBegin) + { + return container_reduce_impl( + x, reduce, init, Number{}, Number{}, Number{}); + } + else + { + return init; + } +} +#endif + +template +__host__ __device__ constexpr auto +container_reverse_inclusive_scan(const Array& x, Reduce f, TData init) +{ + Array y; + + TData r = init; + + static_for{}([&](auto i) { + r = f(r, x[i]); + y(i) = r; + }); + + r = f(r, x[Number<0>{}]); + y(Number<0>{}) = r; + + return y; +} + +template +__host__ __device__ constexpr auto +container_reverse_exclusive_scan(const Array& x, Reduce f, TData init) +{ + Array y; + + TData r = init; + + static_for{}([&](auto i) { + y(i) = r; + r = f(r, x[i]); + }); + + y(Number<0>{}) = r; + + return y; +} + +template +__host__ __device__ constexpr auto +container_reverse_exclusive_scan(const Sequence& seq, Reduce f, Number) +{ + return reverse_exclusive_scan_sequence(seq, f, Number{}); +} + +#if !CK_WORKAROUND_SWDEV_275126 +// rocm4.1 compiler would crash with recursive lambda +template +__host__ __device__ constexpr auto +container_reverse_exclusive_scan(const Tuple& x, Reduce reduce, Init init) +{ + constexpr index_t NSize = sizeof...(Xs); + + // f is recursive function, fs is a dummy of f + // i is index, y_old is current scan, r_old is current reduction + auto f = [&](auto fs, auto i, auto y_old, auto r_old) { + auto r_new = reduce(x[i], r_old); + + auto y_new = container_push_front(y_old, r_new); + + if constexpr(i.value > 1) + { + // recursively call f/fs + return fs(fs, i - Number<1>{}, y_new, r_new); + } + else + { + return y_new; + } + }; + + // start recursion + return f(f, Number{}, make_tuple(init), init); +} +#else +// i is index, y_old is current scan, r_old is current reduction +template +__host__ __device__ constexpr auto container_reverse_exclusive_scan_impl( + const Tuple& x, Reduce reduce, Number i, YOld y_old, ROld r_old) +{ + auto r_new = 
reduce(x[i], r_old); + + auto y_new = container_push_front(y_old, r_new); + + if constexpr(i.value > 1) + { + // recursively call f/fs + return container_reverse_exclusive_scan_impl(x, reduce, i - Number<1>{}, y_new, r_new); + } + else + { + return y_new; + } +} + +template +__host__ __device__ constexpr auto +container_reverse_exclusive_scan(const Tuple& x, Reduce reduce, Init init) +{ + constexpr index_t NSize = sizeof...(Xs); + + return container_reverse_exclusive_scan_impl( + x, reduce, Number{}, make_tuple(init), init); +} +#endif + +// TODO: update to like container_reverse_exclusive_scan to deal with Tuple of Numebr<> +template +__host__ __device__ constexpr auto +container_reverse_inclusive_scan(const Tuple& x, Reduce f, TData init) +{ + constexpr index_t NSize = sizeof...(Xs); + + Tuple y; + + TData r = init; + + static_for{}([&](auto i) { + r = f(r, x[i]); + y(i) = r; + }); + + r = f(r, x[Number<0>{}]); + y(Number<0>{}) = r; + + return y; +} + +template +__host__ __device__ constexpr auto container_concat(const X& x, const Ys&... ys) +{ + return container_concat(x, container_concat(ys...)); +} + +template +__host__ __device__ constexpr auto container_concat(const Array& ax, const Array& ay) +{ + return unpack2( + [&](auto&&... zs) { return make_array(std::forward(zs)...); }, ax, ay); +} + +template +__host__ __device__ constexpr auto container_concat(const Tuple& tx, const Tuple& ty) +{ + return unpack2( + [&](auto&&... zs) { return make_tuple(std::forward(zs)...); }, tx, ty); +} + +template +__host__ __device__ constexpr auto container_concat(const Container& x) +{ + return x; +} + +template +__host__ __device__ constexpr auto get_container_subset(const Array& arr, Sequence) +{ + static_assert(N >= sizeof...(Is), "wrong! size"); + + return make_array(arr[Number{}]...); +} + +template +__host__ __device__ constexpr auto get_container_subset(const Tuple& tup, Sequence) +{ + static_assert(sizeof...(Ts) >= sizeof...(Is), "wrong! size"); + + return make_tuple(tup[Number{}]...); +} + +template +__host__ __device__ constexpr void +set_container_subset(Array& y, Sequence picks, const Array& x) +{ + static_assert(N >= sizeof...(Is), "wrong! size"); + + static_for<0, sizeof...(Is), 1>{}([&](auto i) { y(picks[i]) = x[i]; }); +} + +template +__host__ __device__ constexpr void +set_container_subset(Tuple& y, Sequence picks, const Tuple& x) +{ + static_assert(sizeof...(Ys) >= sizeof...(Is) && sizeof...(Is) == sizeof...(Xs), "wrong! 
size"); + + static_for<0, sizeof...(Is), 1>{}([&](auto i) { y(picks[i]) = x[i]; }); +} + +template +__host__ __device__ constexpr auto to_tuple_of_number(const Container&) +{ + static_assert(is_known_at_compile_time::value, "wrong!"); + + return generate_tuple( + [&](auto i) { + constexpr index_t tmp = Container::At(i); + return Number{}; + }, + Container::Size()); +} + +template +__host__ __device__ constexpr auto sequence_to_tuple_of_number(Sequence) +{ + using Seq = Sequence; + + return generate_tuple( + [&](auto i) { + constexpr index_t tmp = Seq::At(i); + return Number{}; + }, + Seq::Size()); +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/data_type.hpp b/composable_kernel/include/utility/data_type.hpp new file mode 100644 index 0000000000..24a2190e84 --- /dev/null +++ b/composable_kernel/include/utility/data_type.hpp @@ -0,0 +1,1017 @@ +#ifndef CK_FLOAT_TYPE_AMD_HPP +#define CK_FLOAT_TYPE_AMD_HPP + +#include "statically_indexed_array.hpp" + +namespace ck { + +using half_t = _Float16; + +// vector_type +template +struct vector_type; + +// Caution: DO NOT REMOVE +// intentionally have only declaration but no definition to cause compilation failure when trying to +// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of +// vectors" +template +struct vector_type; + +// Caution: DO NOT REMOVE +// intentionally have only declaration but no definition to cause compilation failure when trying to +// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of +// vectors" +template +struct vector_type, N>; + +// vector_type_maker +// This is the right way to handle "vector of vectors": making a bigger vector instead +template +struct vector_type_maker +{ + using type = vector_type; +}; + +template +struct vector_type_maker +{ + using type = vector_type; +}; + +template +struct vector_type_maker, N0> +{ + using type = vector_type; +}; + +template +using vector_type_maker_t = typename vector_type_maker::type; + +template +__host__ __device__ constexpr auto make_vector_type(Number) +{ + return typename vector_type_maker::type{}; +} + +// scalar_type +template +struct scalar_type; + +template +struct scalar_type +{ + using type = T; + static constexpr index_t vector_size = N; +}; + +template +struct scalar_type> +{ + using type = T; + static constexpr index_t vector_size = N; +}; + +// +template <> +struct scalar_type +{ + using type = float; + static constexpr index_t vector_size = 1; +}; + +template <> +struct scalar_type +{ + using type = half_t; + static constexpr index_t vector_size = 1; +}; + +template <> +struct scalar_type +{ + using type = ushort; + static constexpr index_t vector_size = 1; +}; + +template <> +struct scalar_type +{ + using type = int32_t; + static constexpr index_t vector_size = 1; +}; + +template <> +struct scalar_type +{ + using type = int8_t; + static constexpr index_t vector_size = 1; +}; + +// +template +struct vector_type +{ + using d1_t = T; + using type = d1_t; + + union + { + T d1_; + StaticallyIndexedArray d1x1_; + } data_; + + __host__ __device__ constexpr vector_type() : data_{type{0}} {} + + __host__ __device__ constexpr vector_type(type v) : data_{v} {} + + template + __host__ __device__ constexpr const auto& AsType() const + { + static_assert(is_same::value, "wrong!"); + + return data_.d1x1_; + } + + template + __host__ __device__ constexpr auto& AsType() + { + static_assert(is_same::value, "wrong!"); + + return data_.d1x1_; + } +}; + +template 
+struct vector_type +{ + using d1_t = T; + typedef T d2_t __attribute__((ext_vector_type(2))); + + using type = d2_t; + + union + { + d2_t d2_; + StaticallyIndexedArray d1x2_; + StaticallyIndexedArray d2x1_; + } data_; + + __host__ __device__ constexpr vector_type() : data_{type{0}} {} + + __host__ __device__ constexpr vector_type(type v) : data_{v} {} + + template + __host__ __device__ constexpr const auto& AsType() const + { + static_assert(is_same::value || is_same::value, "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x2_; + } + else if constexpr(is_same::value) + { + return data_.d2x1_; + } + } + + template + __host__ __device__ constexpr auto& AsType() + { + static_assert(is_same::value || is_same::value, "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x2_; + } + else if constexpr(is_same::value) + { + return data_.d2x1_; + } + } +}; + +template +struct vector_type +{ + using d1_t = T; + typedef T d2_t __attribute__((ext_vector_type(2))); + typedef T d4_t __attribute__((ext_vector_type(4))); + + using type = d4_t; + + union + { + d4_t d4_; + StaticallyIndexedArray d1x4_; + StaticallyIndexedArray d2x2_; + StaticallyIndexedArray d4x1_; + } data_; + + __host__ __device__ constexpr vector_type() : data_{type{0}} {} + + __host__ __device__ constexpr vector_type(type v) : data_{v} {} + + template + __host__ __device__ constexpr const auto& AsType() const + { + static_assert(is_same::value || is_same::value || is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x4_; + } + else if constexpr(is_same::value) + { + return data_.d2x2_; + } + else if constexpr(is_same::value) + { + return data_.d4x1_; + } + } + + template + __host__ __device__ constexpr auto& AsType() + { + static_assert(is_same::value || is_same::value || is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x4_; + } + else if constexpr(is_same::value) + { + return data_.d2x2_; + } + else if constexpr(is_same::value) + { + return data_.d4x1_; + } + } +}; + +template +struct vector_type +{ + using d1_t = T; + typedef T d2_t __attribute__((ext_vector_type(2))); + typedef T d4_t __attribute__((ext_vector_type(4))); + typedef T d8_t __attribute__((ext_vector_type(8))); + + using type = d8_t; + + union + { + d8_t d8_; + StaticallyIndexedArray d1x8_; + StaticallyIndexedArray d2x4_; + StaticallyIndexedArray d4x2_; + StaticallyIndexedArray d8x1_; + } data_; + + __host__ __device__ constexpr vector_type() : data_{type{0}} {} + + __host__ __device__ constexpr vector_type(type v) : data_{v} {} + + template + __host__ __device__ constexpr const auto& AsType() const + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x8_; + } + else if constexpr(is_same::value) + { + return data_.d2x4_; + } + else if constexpr(is_same::value) + { + return data_.d4x2_; + } + else if constexpr(is_same::value) + { + return data_.d8x1_; + } + } + + template + __host__ __device__ constexpr auto& AsType() + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x8_; + } + else if constexpr(is_same::value) + { + return data_.d2x4_; + } + else if constexpr(is_same::value) + { + return data_.d4x2_; + } + else if constexpr(is_same::value) + { + return data_.d8x1_; + } + } +}; + +template +struct vector_type +{ + using d1_t = T; + typedef T d2_t 
__attribute__((ext_vector_type(2))); + typedef T d4_t __attribute__((ext_vector_type(4))); + typedef T d8_t __attribute__((ext_vector_type(8))); + typedef T d16_t __attribute__((ext_vector_type(16))); + + using type = d16_t; + + union + { + d16_t d16_; + StaticallyIndexedArray d1x16_; + StaticallyIndexedArray d2x8_; + StaticallyIndexedArray d4x4_; + StaticallyIndexedArray d8x2_; + StaticallyIndexedArray d16x1_; + } data_; + + __host__ __device__ constexpr vector_type() : data_{type{0}} {} + + __host__ __device__ constexpr vector_type(type v) : data_{v} {} + + template + __host__ __device__ constexpr const auto& AsType() const + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x16_; + } + else if constexpr(is_same::value) + { + return data_.d2x8_; + } + else if constexpr(is_same::value) + { + return data_.d4x4_; + } + else if constexpr(is_same::value) + { + return data_.d8x2_; + } + else if constexpr(is_same::value) + { + return data_.d16x1_; + } + } + + template + __host__ __device__ constexpr auto& AsType() + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x16_; + } + else if constexpr(is_same::value) + { + return data_.d2x8_; + } + else if constexpr(is_same::value) + { + return data_.d4x4_; + } + else if constexpr(is_same::value) + { + return data_.d8x2_; + } + else if constexpr(is_same::value) + { + return data_.d16x1_; + } + } +}; + +template +struct vector_type +{ + using d1_t = T; + typedef T d2_t __attribute__((ext_vector_type(2))); + typedef T d4_t __attribute__((ext_vector_type(4))); + typedef T d8_t __attribute__((ext_vector_type(8))); + typedef T d16_t __attribute__((ext_vector_type(16))); + typedef T d32_t __attribute__((ext_vector_type(32))); + + using type = d32_t; + + union + { + d32_t d32_; + StaticallyIndexedArray d1x32_; + StaticallyIndexedArray d2x16_; + StaticallyIndexedArray d4x8_; + StaticallyIndexedArray d8x4_; + StaticallyIndexedArray d16x2_; + StaticallyIndexedArray d32x1_; + } data_; + + __host__ __device__ constexpr vector_type() : data_{type{0}} {} + + __host__ __device__ constexpr vector_type(type v) : data_{v} {} + + template + __host__ __device__ constexpr const auto& AsType() const + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value || is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x32_; + } + else if constexpr(is_same::value) + { + return data_.d2x16_; + } + else if constexpr(is_same::value) + { + return data_.d4x8_; + } + else if constexpr(is_same::value) + { + return data_.d8x4_; + } + else if constexpr(is_same::value) + { + return data_.d16x2_; + } + else if constexpr(is_same::value) + { + return data_.d32x1_; + } + } + + template + __host__ __device__ constexpr auto& AsType() + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value || is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x32_; + } + else if constexpr(is_same::value) + { + return data_.d2x16_; + } + else if constexpr(is_same::value) + { + return data_.d4x8_; + } + else if constexpr(is_same::value) + { + return data_.d8x4_; + } + else if constexpr(is_same::value) + { + return data_.d16x2_; + } + else if constexpr(is_same::value) + { + return data_.d32x1_; + } + 
} +}; + +template +struct vector_type +{ + using d1_t = T; + typedef T d2_t __attribute__((ext_vector_type(2))); + typedef T d4_t __attribute__((ext_vector_type(4))); + typedef T d8_t __attribute__((ext_vector_type(8))); + typedef T d16_t __attribute__((ext_vector_type(16))); + typedef T d32_t __attribute__((ext_vector_type(32))); + typedef T d64_t __attribute__((ext_vector_type(64))); + + using type = d64_t; + + union + { + d64_t d64_; + StaticallyIndexedArray d1x64_; + StaticallyIndexedArray d2x32_; + StaticallyIndexedArray d4x16_; + StaticallyIndexedArray d8x8_; + StaticallyIndexedArray d16x4_; + StaticallyIndexedArray d32x2_; + StaticallyIndexedArray d64x1_; + } data_; + + __host__ __device__ constexpr vector_type() : data_{type{0}} {} + + __host__ __device__ constexpr vector_type(type v) : data_{v} {} + + template + __host__ __device__ constexpr const auto& AsType() const + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x64_; + } + else if constexpr(is_same::value) + { + return data_.d2x32_; + } + else if constexpr(is_same::value) + { + return data_.d4x16_; + } + else if constexpr(is_same::value) + { + return data_.d8x8_; + } + else if constexpr(is_same::value) + { + return data_.d16x4_; + } + else if constexpr(is_same::value) + { + return data_.d32x2_; + } + else if constexpr(is_same::value) + { + return data_.d64x1_; + } + } + + template + __host__ __device__ constexpr auto& AsType() + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x64_; + } + else if constexpr(is_same::value) + { + return data_.d2x32_; + } + else if constexpr(is_same::value) + { + return data_.d4x16_; + } + else if constexpr(is_same::value) + { + return data_.d8x8_; + } + else if constexpr(is_same::value) + { + return data_.d16x4_; + } + else if constexpr(is_same::value) + { + return data_.d32x2_; + } + else if constexpr(is_same::value) + { + return data_.d64x1_; + } + } +}; + +template +struct vector_type +{ + using d1_t = T; + typedef T d2_t __attribute__((ext_vector_type(2))); + typedef T d4_t __attribute__((ext_vector_type(4))); + typedef T d8_t __attribute__((ext_vector_type(8))); + typedef T d16_t __attribute__((ext_vector_type(16))); + typedef T d32_t __attribute__((ext_vector_type(32))); + typedef T d64_t __attribute__((ext_vector_type(64))); + typedef T d128_t __attribute__((ext_vector_type(128))); + + using type = d128_t; + + union + { + d128_t d128_; + StaticallyIndexedArray d1x128_; + StaticallyIndexedArray d2x64_; + StaticallyIndexedArray d4x32_; + StaticallyIndexedArray d8x16_; + StaticallyIndexedArray d16x8_; + StaticallyIndexedArray d32x4_; + StaticallyIndexedArray d64x2_; + StaticallyIndexedArray d128x1_; + } data_; + + __host__ __device__ constexpr vector_type() : data_{type{0}} {} + + __host__ __device__ constexpr vector_type(type v) : data_{v} {} + + template + __host__ __device__ constexpr const auto& AsType() const + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value || is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x128_; + } + else if constexpr(is_same::value) + { + return data_.d2x64_; + } + else if constexpr(is_same::value) + { + 
return data_.d4x32_; + } + else if constexpr(is_same::value) + { + return data_.d8x16_; + } + else if constexpr(is_same::value) + { + return data_.d16x8_; + } + else if constexpr(is_same::value) + { + return data_.d32x4_; + } + else if constexpr(is_same::value) + { + return data_.d64x2_; + } + else if constexpr(is_same::value) + { + return data_.d128x1_; + } + } + + template + __host__ __device__ constexpr auto& AsType() + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value || is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x128_; + } + else if constexpr(is_same::value) + { + return data_.d2x64_; + } + else if constexpr(is_same::value) + { + return data_.d4x32_; + } + else if constexpr(is_same::value) + { + return data_.d8x16_; + } + else if constexpr(is_same::value) + { + return data_.d16x8_; + } + else if constexpr(is_same::value) + { + return data_.d32x4_; + } + else if constexpr(is_same::value) + { + return data_.d64x2_; + } + else if constexpr(is_same::value) + { + return data_.d128x1_; + } + } +}; + +template +struct vector_type +{ + using d1_t = T; + typedef T d2_t __attribute__((ext_vector_type(2))); + typedef T d4_t __attribute__((ext_vector_type(4))); + typedef T d8_t __attribute__((ext_vector_type(8))); + typedef T d16_t __attribute__((ext_vector_type(16))); + typedef T d32_t __attribute__((ext_vector_type(32))); + typedef T d64_t __attribute__((ext_vector_type(64))); + typedef T d128_t __attribute__((ext_vector_type(128))); + typedef T d256_t __attribute__((ext_vector_type(256))); + + using type = d256_t; + + union + { + d256_t d256_; + StaticallyIndexedArray d1x256_; + StaticallyIndexedArray d2x128_; + StaticallyIndexedArray d4x64_; + StaticallyIndexedArray d8x32_; + StaticallyIndexedArray d16x16_; + StaticallyIndexedArray d32x8_; + StaticallyIndexedArray d64x4_; + StaticallyIndexedArray d128x2_; + StaticallyIndexedArray d256x1_; + } data_; + + __host__ __device__ constexpr vector_type() : data_{type{0}} {} + + __host__ __device__ constexpr vector_type(type v) : data_{v} {} + + template + __host__ __device__ constexpr const auto& AsType() const + { + static_assert( + is_same::value || is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x256_; + } + else if constexpr(is_same::value) + { + return data_.d2x128_; + } + else if constexpr(is_same::value) + { + return data_.d4x64_; + } + else if constexpr(is_same::value) + { + return data_.d8x32_; + } + else if constexpr(is_same::value) + { + return data_.d16x16_; + } + else if constexpr(is_same::value) + { + return data_.d32x8_; + } + else if constexpr(is_same::value) + { + return data_.d64x4_; + } + else if constexpr(is_same::value) + { + return data_.d128x2_; + } + else if constexpr(is_same::value) + { + return data_.d256x1_; + } + } + + template + __host__ __device__ constexpr auto& AsType() + { + static_assert( + is_same::value || is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x256_; + } + else if constexpr(is_same::value) + { + return data_.d2x128_; + } + else if constexpr(is_same::value) + { + return data_.d4x64_; + } + else if constexpr(is_same::value) + { + return 
data_.d8x32_; + } + else if constexpr(is_same::value) + { + return data_.d16x16_; + } + else if constexpr(is_same::value) + { + return data_.d32x8_; + } + else if constexpr(is_same::value) + { + return data_.d64x4_; + } + else if constexpr(is_same::value) + { + return data_.d128x2_; + } + else if constexpr(is_same::value) + { + return data_.d256x1_; + } + } +}; + +// fp32 +using float2_t = typename vector_type::type; +using float4_t = typename vector_type::type; +using float8_t = typename vector_type::type; +using float16_t = typename vector_type::type; +using float32_t = typename vector_type::type; +using float64_t = typename vector_type::type; + +// fp16 +using half2_t = typename vector_type::type; +using half4_t = typename vector_type::type; +using half8_t = typename vector_type::type; +using half16_t = typename vector_type::type; +using half32_t = typename vector_type::type; +using half64_t = typename vector_type::type; + +// bfp16 +using ushort2_t = typename vector_type::type; +using ushort4_t = typename vector_type::type; +using ushort8_t = typename vector_type::type; +using ushort16_t = typename vector_type::type; +using ushort32_t = typename vector_type::type; +using ushort64_t = typename vector_type::type; + +// i32 +using int32x2_t = typename vector_type::type; +using int32x4_t = typename vector_type::type; +using int32x8_t = typename vector_type::type; +using int32x16_t = typename vector_type::type; +using int32x32_t = typename vector_type::type; +using int32x64_t = typename vector_type::type; + +// i8 +using int8x2_t = typename vector_type::type; +using int8x4_t = typename vector_type::type; +using int8x8_t = typename vector_type::type; +using int8x16_t = typename vector_type::type; +using int8x32_t = typename vector_type::type; +using int8x64_t = typename vector_type::type; + +// data type conversion +template +struct type_convert +{ + template + __device__ T operator()(X x) const + { + return static_cast(x); + } +}; + +template <> +template <> +__device__ float type_convert::operator()(ushort x) const +{ + return bfloat16_to_float(x); +} + +template <> +template <> +__device__ ushort type_convert::operator()(float x) const +{ + return float_to_bfloat16(x); +} + +// TODO: deprecate this +template +struct inner_product_with_conversion +{ + static constexpr auto convert = type_convert(); + + template + __device__ T operator()(typename vector_type::type a, + typename vector_type::type b) const + { + const vector_type a_vector{a}; + const vector_type b_vector{b}; + + T acc = 0; + + static_for<0, N, 1>{}([&](auto i) { + acc += convert(a_vector.Scalars()[i]) * convert(b_vector.Scalars()[i]); + }); + + return acc; + } + + __device__ T operator()(float_t a, float_t b) const { return convert(a) * convert(b); } + + __device__ T operator()(int8x4_t a, int8x4_t b) const + { + const vector_type a_vector{a}; + const vector_type b_vector{b}; + + T acc = 0; + + static_for<0, 4, 1>{}([&](auto i) { + acc += convert(a_vector.AsType()[i]) * convert(b_vector.AsType()[i]); + }); + + return acc; + } + + __device__ T operator()(int8x8_t a, int8x8_t b) const + { + const vector_type a_vector{a}; + const vector_type b_vector{b}; + + T acc = 0; + + static_for<0, 8, 1>{}([&](auto i) { + acc += convert(a_vector.AsType()[i]) * convert(b_vector.AsType()[i]); + }); + + return acc; + } + + __device__ T operator()(int8x16_t a, int8x16_t b) const + { + const vector_type a_vector{a}; + const vector_type b_vector{b}; + + T acc = 0; + + static_for<0, 16, 1>{}([&](auto i) { + acc += convert(a_vector.AsType()[i]) * 
convert(b_vector.AsType()[i]); + }); + + return acc; + } +}; + +template +struct NumericLimits; + +template <> +struct NumericLimits +{ + __host__ __device__ static constexpr int32_t Min() + { + return std::numeric_limits::min(); + } + + __host__ __device__ static constexpr int32_t Max() + { + return std::numeric_limits::max(); + } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/data_type_enum.hpp b/composable_kernel/include/utility/data_type_enum.hpp new file mode 100644 index 0000000000..43499605dc --- /dev/null +++ b/composable_kernel/include/utility/data_type_enum.hpp @@ -0,0 +1,20 @@ +#ifndef CK_DATA_TYPE_ENUM_HPP +#define CK_DATA_TYPE_ENUM_HPP + +namespace ck { + +// this enumerate should be synchronized with include/miopen.h +typedef enum +{ + Half = 0, + Float = 1, + Int32 = 2, + Int8 = 3, + Int8x4 = 4, + BFloat16 = 5, + Double = 6, + Unknown = 100, +} DataTypeEnum_t; + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/data_type_helper.hpp b/composable_kernel/include/utility/data_type_helper.hpp new file mode 100644 index 0000000000..6a234cd10b --- /dev/null +++ b/composable_kernel/include/utility/data_type_helper.hpp @@ -0,0 +1,76 @@ +#ifndef CK_DATA_TYPE_HELPER_HPP +#define CK_DATA_TYPE_HELPER_HPP + +#include "data_type.hpp" +#include "data_type_enum.hpp" + +namespace ck { + +template +struct get_datatype_from_enum; + +template <> +struct get_datatype_from_enum +{ + using type = int8_t; +}; + +template <> +struct get_datatype_from_enum +{ + using type = int32_t; +}; + +template <> +struct get_datatype_from_enum +{ + using type = half_t; +}; + +template <> +struct get_datatype_from_enum +{ + using type = float; +}; + +template <> +struct get_datatype_from_enum +{ + using type = double; +}; + +template +struct get_datatype_enum_from_type; + +template <> +struct get_datatype_enum_from_type +{ + static constexpr DataTypeEnum_t value = DataTypeEnum_t::Int8; +}; + +template <> +struct get_datatype_enum_from_type +{ + static constexpr DataTypeEnum_t value = DataTypeEnum_t::Int32; +}; + +template <> +struct get_datatype_enum_from_type +{ + static constexpr DataTypeEnum_t value = DataTypeEnum_t::Half; +}; + +template <> +struct get_datatype_enum_from_type +{ + static constexpr DataTypeEnum_t value = DataTypeEnum_t::Float; +}; + +template <> +struct get_datatype_enum_from_type +{ + static constexpr DataTypeEnum_t value = DataTypeEnum_t::Double; +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/dynamic_buffer.hpp b/composable_kernel/include/utility/dynamic_buffer.hpp new file mode 100644 index 0000000000..5f5f386306 --- /dev/null +++ b/composable_kernel/include/utility/dynamic_buffer.hpp @@ -0,0 +1,208 @@ +#ifndef CK_DYNAMIC_BUFFER_HPP +#define CK_DYNAMIC_BUFFER_HPP + +namespace ck { + +#include "amd_buffer_addressing_v2.hpp" + +template +struct DynamicBuffer +{ + using type = T; + + T* p_data_; + ElementSpaceSize element_space_size_; + + __host__ __device__ constexpr DynamicBuffer(T* p_data, ElementSpaceSize element_space_size) + : p_data_{p_data}, element_space_size_{element_space_size} + { + } + + __host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace() + { + return BufferAddressSpace; + } + + __host__ __device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; } + + __host__ __device__ constexpr T& operator()(index_t i) { return p_data_[i]; } + + template >>::type, + typename scalar_type>>::type>::value, + bool>::type = false> + __host__ __device__ constexpr auto 
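// --- editorial aside, not part of the patch ---
// The two trait families above (get_datatype_from_enum and
// get_datatype_enum_from_type) are inverse lookup tables between
// DataTypeEnum_t and C++ types. The host-only sketch below shows how such a
// pair can be kept consistent with a compile-time round-trip check; the
// Demo*/demo_* names are illustrative, only the ck:: spellings above are real.
#include <cstdint>

enum class DemoDataType { Half = 0, Float = 1, Int32 = 2, Int8 = 3 };

template <DemoDataType E> struct demo_type_from_enum;
template <> struct demo_type_from_enum<DemoDataType::Float> { using type = float; };
template <> struct demo_type_from_enum<DemoDataType::Int32> { using type = std::int32_t; };
template <> struct demo_type_from_enum<DemoDataType::Int8>  { using type = std::int8_t;  };

template <typename T> struct demo_enum_from_type;
template <> struct demo_enum_from_type<float>        { static constexpr auto value = DemoDataType::Float; };
template <> struct demo_enum_from_type<std::int32_t> { static constexpr auto value = DemoDataType::Int32; };
template <> struct demo_enum_from_type<std::int8_t>  { static constexpr auto value = DemoDataType::Int8;  };

// round trip evaluated entirely at compile time: enum -> type -> enum
static_assert(demo_enum_from_type<demo_type_from_enum<DemoDataType::Float>::type>::value ==
                  DemoDataType::Float,
              "enum/type tables out of sync");
// --- end editorial aside ---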
Get(index_t i, bool is_valid_offset) const + { + // X contains multiple T + constexpr index_t scalar_per_t_vector = + scalar_type>>::vector_size; + + constexpr index_t scalar_per_x_vector = + scalar_type>>::vector_size; + + static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, + "wrong! X need to be multiple T"); + + constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; + + if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Global) + { +#if CK_USE_AMD_BUFFER_ADDRESSING + return amd_buffer_load_v2>, t_per_x>( + p_data_, i, is_valid_offset, element_space_size_); +#else + return is_valid_offset ? *reinterpret_cast(&p_data_[i]) : X{0}; +#endif + } + else + { + return is_valid_offset ? *reinterpret_cast(&p_data_[i]) : X{0}; + } + } + + template >>::type, + typename scalar_type>>::type>::value, + bool>::type = false> + __host__ __device__ void Set(index_t i, bool is_valid_offset, const X& x) + { + // X contains multiple T + constexpr index_t scalar_per_t_vector = + scalar_type>>::vector_size; + + constexpr index_t scalar_per_x_vector = + scalar_type>>::vector_size; + + static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, + "wrong! X need to be multiple T"); + + constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; + + if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Global) + { +#if CK_USE_AMD_BUFFER_ADDRESSING + amd_buffer_store_v2>, t_per_x>( + x, p_data_, i, is_valid_offset, element_space_size_); +#else + if(is_valid_offset) + { + *reinterpret_cast(&p_data_[i]) = x; + } +#endif + } + else if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Lds) + { + if(is_valid_offset) + { +#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE + *reinterpret_cast(&p_data_[i]) = x; +#else + // HACK: compiler would lower IR "store address_space(3)" into + // inefficient + // ISA, so I try to let compiler emit IR "store" which would be lower to + // ds_write_b128 + // TODO: remove this after compiler fix + if constexpr(is_same>>::type, + int8_t>::value) + { + static_assert( + (is_same>, int8_t>::value && + is_same>, int8_t>::value) || + (is_same>, int8_t>::value && + is_same>, int8x2_t>::value) || + (is_same>, int8_t>::value && + is_same>, int8x4_t>::value) || + (is_same>, int8x4_t>::value && + is_same>, int8x4_t>::value) || + (is_same>, int8x8_t>::value && + is_same>, int8x8_t>::value) || + (is_same>, int8x16_t>::value && + is_same>, int8x16_t>::value), + "wrong! 
not implemented for this combination, please add " + "implementation"); + + if constexpr(is_same>, int8_t>::value && + is_same>, int8_t>::value) + { + // HACK: cast pointer of x is bad + // TODO: remove this after compiler fix + *reinterpret_cast(&p_data_[i]) = + *reinterpret_cast(&x); + } + else if constexpr(is_same>, int8_t>::value && + is_same>, int8x2_t>::value) + { + // HACK: cast pointer of x is bad + // TODO: remove this after compiler fix + *reinterpret_cast(&p_data_[i]) = + *reinterpret_cast(&x); + } + else if constexpr(is_same>, int8_t>::value && + is_same>, int8x4_t>::value) + { + // HACK: cast pointer of x is bad + // TODO: remove this after compiler fix + *reinterpret_cast(&p_data_[i]) = + *reinterpret_cast(&x); + } + else if constexpr(is_same>, + int8x4_t>::value && + is_same>, int8x4_t>::value) + { + // HACK: cast pointer of x is bad + // TODO: remove this after compiler fix + *reinterpret_cast(&p_data_[i]) = + *reinterpret_cast(&x); + } + else if constexpr(is_same>, + int8x8_t>::value && + is_same>, int8x8_t>::value) + { + // HACK: cast pointer of x is bad + // TODO: remove this after compiler fix + *reinterpret_cast(&p_data_[i]) = + *reinterpret_cast(&x); + } + else if constexpr(is_same>, + int8x16_t>::value && + is_same>, int8x16_t>::value) + { + // HACK: cast pointer of x is bad + // TODO: remove this after compiler fix + *reinterpret_cast(&p_data_[i]) = + *reinterpret_cast(&x); + } + } + else + { + *reinterpret_cast(&p_data_[i]) = x; + } +#endif + } + } + else + { + if(is_valid_offset) + { + *reinterpret_cast(&p_data_[i]) = x; + } + } + } + + __host__ __device__ static constexpr bool IsStaticBuffer() { return false; } + + __host__ __device__ static constexpr bool IsDynamicBuffer() { return true; } +}; + +template +__host__ __device__ constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize element_space_size) +{ + return DynamicBuffer{p, element_space_size}; +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/functional.hpp b/composable_kernel/include/utility/functional.hpp new file mode 100644 index 0000000000..b84b617f44 --- /dev/null +++ b/composable_kernel/include/utility/functional.hpp @@ -0,0 +1,116 @@ +#ifndef CK_FUNCTIONAL_HPP +#define CK_FUNCTIONAL_HPP + +#include "integral_constant.hpp" +#include "type.hpp" + +namespace ck { + +// TODO: right? wrong? +struct forwarder +{ + template + __host__ __device__ constexpr T&& operator()(T&& x) const + { + return static_cast(x); + } +}; + +struct swallow +{ + template + __host__ __device__ constexpr swallow(Ts&&...) 
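// --- editorial aside, not part of the patch ---
// DynamicBuffer::Get / Set above perform predicated, vector-width accesses:
// X packs several T, the access is skipped (or zero-filled) when
// is_valid_offset is false, and for global memory the AMD buffer intrinsics
// are used instead of plain loads/stores. The host-only sketch below shows
// only the generic fallback behaviour; memcpy stands in for the
// reinterpret_cast so the sketch has no aliasing issues, and all demo_*
// names are illustrative.
#include <cstddef>
#include <cstring>

template <typename X, typename T>
X demo_buffer_get(const T* p_data, std::ptrdiff_t i, bool is_valid_offset)
{
    static_assert(sizeof(X) % sizeof(T) == 0, "X must pack a whole number of T");
    X x{};
    if(is_valid_offset)
        std::memcpy(&x, p_data + i, sizeof(X)); // one vector-sized transaction
    return x;                                   // zero-filled when predicated off
}

template <typename X, typename T>
void demo_buffer_set(T* p_data, std::ptrdiff_t i, bool is_valid_offset, const X& x)
{
    static_assert(sizeof(X) % sizeof(T) == 0, "X must pack a whole number of T");
    if(is_valid_offset)
        std::memcpy(p_data + i, &x, sizeof(X)); // skipped when out of range
}
// --- end editorial aside ---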
+ { + } +}; + +template +struct logical_and +{ + constexpr bool operator()(const T& x, const T& y) const { return x && y; } +}; + +template +struct logical_or +{ + constexpr bool operator()(const T& x, const T& y) const { return x || y; } +}; + +template +struct logical_not +{ + constexpr bool operator()(const T& x) const { return !x; } +}; + +// Emulate if constexpr +template +struct static_if; + +template <> +struct static_if +{ + using Type = static_if; + + template + __host__ __device__ constexpr auto operator()(F f) const + { + // This is a trick for compiler: + // Pass forwarder to lambda "f" as "auto" argument, and make sure "f" will + // use it, + // this will make "f" a generic lambda, so that "f" won't be compiled + // until being + // instantiated here + f(forwarder{}); + return Type{}; + } + + template + __host__ __device__ static void Else(F) + { + } +}; + +template <> +struct static_if +{ + using Type = static_if; + + template + __host__ __device__ constexpr auto operator()(F) const + { + return Type{}; + } + + template + __host__ __device__ static void Else(F f) + { + // This is a trick for compiler: + // Pass forwarder to lambda "f" as "auto" argument, and make sure "f" will + // use it, + // this will make "f" a generic lambda, so that "f" won't be compiled + // until being + // instantiated here + f(forwarder{}); + } +}; + +template +struct conditional; + +template +struct conditional +{ + using type = X; +}; + +template +struct conditional +{ + using type = Y; +}; + +template +using conditional_t = typename conditional::type; + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/functional2.hpp b/composable_kernel/include/utility/functional2.hpp new file mode 100644 index 0000000000..371182a05e --- /dev/null +++ b/composable_kernel/include/utility/functional2.hpp @@ -0,0 +1,48 @@ +#ifndef CK_FUNCTIONAL2_HPP +#define CK_FUNCTIONAL2_HPP + +#include "functional.hpp" +#include "sequence.hpp" + +namespace ck { + +namespace detail { + +template +struct static_for_impl; + +template +struct static_for_impl> +{ + template + __host__ __device__ constexpr void operator()(F f) const + { + swallow{(f(Number{}), 0)...}; + } +}; + +} // namespace detail + +// F signature: F(Number) +template +struct static_for +{ + __host__ __device__ constexpr static_for() + { + static_assert(Increment != 0 && (NEnd - NBegin) % Increment == 0, + "Wrong! should satisfy (NEnd - NBegin) % Increment == 0"); + static_assert((Increment > 0 && NBegin <= NEnd) || (Increment < 0 && NBegin >= NEnd), + "wrongs! should (Increment > 0 && NBegin <= NEnd) || (Increment < 0 && " + "NBegin >= NEnd)"); + } + + template + __host__ __device__ constexpr void operator()(F f) const + { + detail::static_for_impl::type>{}( + f); + } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/functional3.hpp b/composable_kernel/include/utility/functional3.hpp new file mode 100644 index 0000000000..6a400f3ca6 --- /dev/null +++ b/composable_kernel/include/utility/functional3.hpp @@ -0,0 +1,142 @@ +#ifndef CK_FUNCTIONAL3_HPP +#define CK_FUNCTIONAL3_HPP + +#include "functional.hpp" +#include "functional2.hpp" +#include "sequence.hpp" +#include "multi_index.hpp" + +namespace ck { + +namespace detail { + +// RemainLengths: Sequence<...> +// Orders: Sequence<...> +template +struct static_ford_impl +{ + __host__ __device__ constexpr static_ford_impl() + { + static_assert(RemainLengths::GetSize() > 0, "wrong! 
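// --- editorial aside, not part of the patch ---
// static_for above hands the loop body a distinct Number<I> per iteration by
// expanding a Sequence into a pack and "swallowing" the resulting comma
// expressions, so every index is a compile-time constant and the loop is
// fully instantiated (and typically unrolled). The host-only sketch below
// reproduces that effect with std::index_sequence and a fold expression;
// all demo_* names are illustrative.
#include <array>
#include <cstddef>
#include <iostream>
#include <type_traits>
#include <utility>

template <std::size_t... Is, typename F>
constexpr void demo_static_for_impl(std::index_sequence<Is...>, F f)
{
    // the fold expression plays the role of the swallow{...} trick above
    (f(std::integral_constant<std::size_t, Is>{}), ...);
}

template <std::size_t N, typename F>
constexpr void demo_static_for(F f)
{
    demo_static_for_impl(std::make_index_sequence<N>{}, f);
}

int main()
{
    std::array<int, 4> a{1, 2, 3, 4};
    int sum = 0;
    demo_static_for<4>([&](auto i) {
        constexpr std::size_t idx = decltype(i)::value; // compile-time index
        sum += std::get<idx>(a);                        // usable as a template argument
    });
    std::cout << sum << "\n"; // prints 10
}
// --- end editorial aside ---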
should not get here"); + } + + // F signature: F(Sequence<...>) + // CurrentOrderedId: Sequence<...> + template + __host__ __device__ constexpr void operator()(F f, CurrentOrderedId) const + { + static_for<0, RemainLengths::Front(), 1>{}([=](auto I) { + static_ford_impl{}( + f, CurrentOrderedId::PushBack(I)); + }); + } +}; + +template +struct static_ford_impl, Orders> +{ + // F signature: F(Sequence<...>) + // OrderedId: Sequence<...> + template + __host__ __device__ constexpr void operator()(F f, OrderedId) const + { + // retrive unordered Id + f(OrderedId::ReorderGivenOld2New(Orders{})); + } +}; + +// RemainLengths: Sequence<...> +// Orders: Sequence<...> +template +struct ford_impl +{ + __host__ __device__ constexpr ford_impl() + { + static_assert(RemainLengths::GetSize() > 0, "wrong! should not get here"); + } + + // F signature: F(Array<...> multi_id) + // CurrentOrderdId: Array<...> + template + __host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const + { + for(index_t i = 0; i < RemainLengths::Front(); ++i) + { + ford_impl{}( + f, container_push_back(current_ordered_id, i)); + } + } +}; + +template +struct ford_impl, Orders> +{ + // F signature: F(Array<...> multi_id) + // CurrentOrderdId: Array<...> + template + __host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const + { + // retrive unordered Id + f(container_reorder_given_old2new(current_ordered_id, Orders{})); + } +}; + +} // namespace detail + +// Lengths is Sequence<...>, it is the length of each dimension for +// N-dimensional loop +// Orders is Sequence<...>, it is the order of dimension in which static_ford +// will loop over each +// dimension +template ::type> +struct static_ford +{ + __host__ __device__ constexpr static_ford() + { + static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty"); + static_assert(Lengths::GetSize() == Orders::GetSize(), "wrong! inconsistent size"); + } + + // F signature: F(Sequence<...> multi_id) + // multi_id is the unordered multi-index + template + __host__ __device__ constexpr void operator()(F f) const + { + constexpr auto ordered_lengths = Lengths::ReorderGivenNew2Old(Orders{}); + detail::static_ford_impl{}(f, Sequence<>{}); + } +}; + +// Lengths is Sequence<...>, it is the length of each dimension for +// N-dimensional loop +// Orders is Sequence<...>, it is the order of dimension in which ford will loop +// over each +// dimension +template ::type> +struct ford +{ + __host__ __device__ constexpr ford() + { + static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty"); + static_assert(Lengths::GetSize() == Orders::GetSize(), "wrong! 
inconsistent size"); + } + + // F signature: F(Array<...> multi_id) + // multi_id is the unordered multi-index + template + __host__ __device__ constexpr void operator()(F f) const + { + constexpr auto ordered_lengths = Lengths::ReorderGivenNew2Old(Orders{}); + + for(index_t i = 0; i < ordered_lengths.Front(); ++i) + { + detail::ford_impl{}(f, + make_multi_index(i)); + } + } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/functional4.hpp b/composable_kernel/include/utility/functional4.hpp new file mode 100644 index 0000000000..b039644380 --- /dev/null +++ b/composable_kernel/include/utility/functional4.hpp @@ -0,0 +1,62 @@ +#ifndef CK_FUNCTIONAL4_HPP +#define CK_FUNCTIONAL4_HPP + +#include "sequence.hpp" +#include "tuple.hpp" +#include "array.hpp" + +namespace ck { + +namespace detail { + +template +struct unpack_impl; + +template +struct unpack_impl> +{ + template + __host__ __device__ constexpr auto operator()(F&& f, X&& x) const + { + return std::forward(f)(std::forward(x).At(Number{})...); + } +}; + +template +struct unpack2_impl; + +// TODO: remove this, after properly implementing unpack that takes any number of containers +template +struct unpack2_impl, Sequence> +{ + template + __host__ __device__ constexpr auto operator()(F&& f, X&& x, Y&& y) const + { + return std::forward(f)(std::forward(x).At(Number{})..., + std::forward(y).At(Number{})...); + } +}; + +} // namespace detail + +template +__host__ __device__ constexpr auto unpack(F&& f, X&& x) +{ + using X_ = remove_reference_t; + return detail::unpack_impl::type>{}( + std::forward(f), std::forward(x)); +} + +// TODO: properly implement unpack that takes any number of containers +template +__host__ __device__ constexpr auto unpack2(F&& f, X&& x, Y&& y) +{ + using X_ = remove_reference_t; + using Y_ = remove_reference_t; + return detail::unpack2_impl::type, + typename arithmetic_sequence_gen<0, Y_::Size(), 1>::type>{}( + std::forward(f), std::forward(x), std::forward(y)); +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/integral_constant.hpp b/composable_kernel/include/utility/integral_constant.hpp new file mode 100644 index 0000000000..14f3df894b --- /dev/null +++ b/composable_kernel/include/utility/integral_constant.hpp @@ -0,0 +1,17 @@ +#ifndef CK_INTEGRAL_CONSTANT_HPP +#define CK_INTEGRAL_CONSTANT_HPP + +namespace ck { + +template +struct integral_constant +{ + static constexpr T value = v; + typedef T value_type; + typedef integral_constant type; + __host__ __device__ constexpr operator value_type() const noexcept { return value; } + __host__ __device__ constexpr value_type operator()() const noexcept { return value; } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/magic_division.hpp b/composable_kernel/include/utility/magic_division.hpp new file mode 100644 index 0000000000..b7489016e9 --- /dev/null +++ b/composable_kernel/include/utility/magic_division.hpp @@ -0,0 +1,155 @@ +#ifndef CK_MAGIC_DIVISION_HPP +#define CK_MAGIC_DIVISION_HPP + +#include "config.hpp" +#include "integral_constant.hpp" +#include "number.hpp" +#include "type.hpp" +#include "tuple.hpp" + +namespace ck { + +// magic number division +// Caution: +// 1. For uint32_t as dividend: magic number division implementation being used would produce +// correct result if the dividend is uint32_t and its value is within 31-bit value range. +// 2. 
For int32_t as dividendd: magic number division for int32_t dividened has not been +// implemented, the int32_t dividend would be bit-wise interpreted as uint32_t and magic number +// division implementation for uint32_t is then used. Therefore, dividend value need to be +// non-negative. +// TODO: +// 1. Implement magic number divison for int32_t +// 2. Implement magic number divison for unit32_t with 32-bit value range +struct MagicDivision +{ + // uint32_t + __host__ __device__ static constexpr auto CalculateMagicNumbers(uint32_t divisor) + { + // assert(divisior >= 1 && divisior <= INT32_MAX); + uint32_t shift = 0; + for(shift = 0; shift < 32; ++shift) + { + if((1U << shift) >= divisor) + { + break; + } + } + + uint64_t one = 1; + uint64_t multiplier = ((one << 32) * ((one << shift) - divisor)) / divisor + 1; + // assert(multiplier <= 0xffffffffUL); + + return make_tuple(uint32_t(multiplier), shift); + } + + __host__ __device__ static constexpr uint32_t CalculateMagicMultiplier(uint32_t divisor) + { + auto tmp = CalculateMagicNumbers(divisor); + + return tmp[Number<0>{}]; + } + + __host__ __device__ static constexpr uint32_t CalculateMagicShift(uint32_t divisor) + { + auto tmp = CalculateMagicNumbers(divisor); + + return tmp[Number<1>{}]; + } + + // integral_constant + template + __host__ __device__ static constexpr auto + CalculateMagicNumbers(integral_constant) + { + constexpr auto tmp = CalculateMagicNumbers(uint32_t{Divisor}); + + constexpr uint32_t multiplier = tmp[Number<0>{}]; + constexpr uint32_t shift = tmp[Number<1>{}]; + + return make_tuple(integral_constant{}, + integral_constant{}); + } + + template + __host__ __device__ static constexpr auto + CalculateMagicMultiplier(integral_constant) + { + constexpr uint32_t multiplier = CalculateMagicMultiplier(uint32_t{Divisor}); + + return integral_constant{}; + } + + template + __host__ __device__ static constexpr auto + CalculateMagicShift(integral_constant) + { + constexpr uint32_t shift = CalculateMagicShift(uint32_t{Divisor}); + + return integral_constant{}; + } + + // integral_constant + template + __host__ __device__ static constexpr auto + CalculateMagicNumbers(integral_constant) + { + return CalculateMagicNumbers(integral_constant{}); + } + + template + __host__ __device__ static constexpr auto + CalculateMagicMultiplier(integral_constant) + { + return CalculateMagicMultiplier(integral_constant{}); + } + + template + __host__ __device__ static constexpr auto + CalculateMagicShift(integral_constant) + { + return CalculateMagicShift(integral_constant{}); + } + + // magic division for uint32_t + __host__ __device__ static constexpr uint32_t + DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift) + { + uint32_t tmp = (uint64_t(dividend) * uint64_t(multiplier)) >> 32; + return (tmp + dividend) >> shift; + } + +#if 1 // debug + // HACK: magic division for int32_t + // HACK: use dividend_i32 as if it's uint32_t, dividend_i32 need to be + // non-negative for result to be correct + // TODO: figure out how to do magic number divison for int32_t as dividended + __host__ __device__ static constexpr int32_t + DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift) + { + uint32_t dividend_u32 = as_type(dividend_i32); + uint32_t tmp = + (static_cast(dividend_u32) * static_cast(multiplier)) >> 32; + return (tmp + dividend_u32) >> shift; + } +#else + // the inline ASM is producing wrong result + __host__ __device__ static int32_t + DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t 
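// --- editorial aside, not part of the patch ---
// Worked example of the magic-number division defined above, restated as a
// host-only program with a brute-force check. For divisor = 7 the same
// construction as CalculateMagicNumbers gives shift = 3 (smallest s with
// 2^s >= 7) and multiplier = floor(2^32 * (2^3 - 7) / 7) + 1 = 613566757,
// so q = (((x * multiplier) >> 32) + x) >> shift equals x / 7 within the
// 31-bit dividend range documented in the header comment. All demo_* names
// are illustrative.
#include <cassert>
#include <cstdint>

inline std::uint32_t demo_magic_divide(std::uint32_t dividend,
                                       std::uint32_t multiplier,
                                       std::uint32_t shift)
{
    // same arithmetic as DoMagicDivision above
    std::uint32_t tmp = static_cast<std::uint32_t>(
        (std::uint64_t(dividend) * std::uint64_t(multiplier)) >> 32);
    return (tmp + dividend) >> shift;
}

int main()
{
    const std::uint32_t divisor = 7;

    // same construction as CalculateMagicNumbers above
    std::uint32_t shift = 0;
    while((1U << shift) < divisor)
        ++shift;
    const std::uint64_t one        = 1;
    const std::uint32_t multiplier = static_cast<std::uint32_t>(
        ((one << 32) * ((one << shift) - divisor)) / divisor + 1);

    assert(shift == 3 && multiplier == 613566757u);

    // exhaustive check over a small range of dividends
    for(std::uint32_t x = 0; x < 1000000u; ++x)
        assert(demo_magic_divide(x, multiplier, shift) == x / divisor);

    return 0;
}
// --- end editorial aside ---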
shift) + { + uint32_t r; + asm volatile("\n \ + v_mul_hi_u32 %0, %1, %2 \n \ + v_add_u32_e32 %0, %1, %0 \n \ + v_lshrrev_b32_e32 %0, %3, %0 \n \ + " + : "=v"(r) + : "v"(as_type(dividend_i32)), "s"(multiplier), "s"(shift)); + + return as_type(r); + } +#endif +}; + +} // namespace ck + +#endif diff --git a/composable_kernel/include/utility/math.hpp b/composable_kernel/include/utility/math.hpp new file mode 100644 index 0000000000..e451059647 --- /dev/null +++ b/composable_kernel/include/utility/math.hpp @@ -0,0 +1,225 @@ +#ifndef CK_MATH_HPP +#define CK_MATH_HPP + +#include "config.hpp" +#include "integral_constant.hpp" +#include "number.hpp" +#include "type.hpp" + +namespace ck { +namespace math { + +template +struct scales +{ + __host__ __device__ constexpr T operator()(T a) const { return s * a; } +}; + +template +struct plus +{ + __host__ __device__ constexpr T operator()(T a, T b) const { return a + b; } +}; + +template +struct minus +{ + __host__ __device__ constexpr T operator()(T a, T b) const { return a - b; } +}; + +template +struct multiplies +{ + __host__ __device__ constexpr T operator()(T a, T b) const { return a * b; } +}; + +struct multiplies_v2 +{ + template + __host__ __device__ constexpr auto operator()(const A& a, const B& b) const + { + return a * b; + } +}; + +template +struct maximize +{ + __host__ __device__ constexpr T operator()(T a, T b) const { return a >= b ? a : b; } +}; + +template +struct minimize +{ + __host__ __device__ constexpr T operator()(T a, T b) const { return a <= b ? a : b; } +}; + +template +struct integer_divide_ceiler +{ + __host__ __device__ constexpr T operator()(T a, T b) const + { + static_assert(is_same{} || is_same{}, "wrong type"); + + return (a + b - Number<1>{}) / b; + } +}; + +template +__host__ __device__ constexpr auto integer_divide_floor(X x, Y y) +{ + return x / y; +} + +template +__host__ __device__ constexpr auto integer_divide_ceil(X x, Y y) +{ + return (x + y - Number<1>{}) / y; +} + +template +__host__ __device__ constexpr auto integer_least_multiple(X x, Y y) +{ + return y * integer_divide_ceil(x, y); +} + +template +__host__ __device__ constexpr T max(T x) +{ + return x; +} + +template +__host__ __device__ constexpr T max(T x, T y) +{ + return x > y ? x : y; +} + +template +__host__ __device__ constexpr index_t max(Number, index_t y) +{ + return X > y ? X : y; +} + +template +__host__ __device__ constexpr index_t max(index_t x, Number) +{ + return x > Y ? x : Y; +} + +template +__host__ __device__ constexpr auto max(X x, Ys... ys) +{ + static_assert(sizeof...(Ys) > 0, "not enough argument"); + + return max(x, max(ys...)); +} + +template +__host__ __device__ constexpr T min(T x) +{ + return x; +} + +template +__host__ __device__ constexpr T min(T x, T y) +{ + return x < y ? x : y; +} + +template +__host__ __device__ constexpr index_t min(Number, index_t y) +{ + return X < y ? X : y; +} + +template +__host__ __device__ constexpr index_t min(index_t x, Number) +{ + return x < Y ? x : Y; +} + +template +__host__ __device__ constexpr auto min(X x, Ys... 
ys) +{ + static_assert(sizeof...(Ys) > 0, "not enough argument"); + + return min(x, min(ys...)); +} + +// greatest common divisor, aka highest common factor +__host__ __device__ constexpr index_t gcd(index_t x, index_t y) +{ + if(x < 0) + { + return gcd(-x, y); + } + else if(y < 0) + { + return gcd(x, -y); + } + else if(x == y || x == 0) + { + return y; + } + else if(y == 0) + { + return x; + } + else if(x > y) + { + return gcd(x % y, y); + } + else + { + return gcd(x, y % x); + } +} + +template +__host__ __device__ constexpr auto gcd(Number, Number) +{ + constexpr auto r = gcd(X, Y); + + return Number{}; +} + +template = 2, bool>::type = false> +__host__ __device__ constexpr auto gcd(X x, Ys... ys) +{ + return gcd(x, gcd(ys...)); +} + +// least common multiple +template +__host__ __device__ constexpr auto lcm(X x, Y y) +{ + return (x * y) / gcd(x, y); +} + +template = 2, bool>::type = false> +__host__ __device__ constexpr auto lcm(X x, Ys... ys) +{ + return lcm(x, lcm(ys...)); +} + +template +struct equal +{ + __host__ __device__ constexpr bool operator()(T x, T y) const { return x == y; } +}; + +template +struct less +{ + __host__ __device__ constexpr bool operator()(T x, T y) const { return x < y; } +}; + +} // namespace math +} // namespace ck + +#endif diff --git a/composable_kernel/include/utility/multi_index.hpp b/composable_kernel/include/utility/multi_index.hpp new file mode 100644 index 0000000000..0bb34fb1e2 --- /dev/null +++ b/composable_kernel/include/utility/multi_index.hpp @@ -0,0 +1,12 @@ +#ifndef CK_MULTI_INDEX_HPP +#define CK_MULTI_INDEX_HPP + +#include "common_header.hpp" + +#if CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX +#include "array_multi_index.hpp" +#else +#include "statically_indexed_array_multi_index.hpp" +#endif + +#endif diff --git a/composable_kernel/include/utility/number.hpp b/composable_kernel/include/utility/number.hpp new file mode 100644 index 0000000000..f8c5643694 --- /dev/null +++ b/composable_kernel/include/utility/number.hpp @@ -0,0 +1,44 @@ +#ifndef CK_NUMBER_HPP +#define CK_NUMBER_HPP + +#include "integral_constant.hpp" + +namespace ck { + +template +using Number = integral_constant; + +template +__host__ __device__ constexpr auto operator+(Number, Number) +{ + return Number{}; +} + +template +__host__ __device__ constexpr auto operator-(Number, Number) +{ + static_assert(Y <= X, "wrong!"); + return Number{}; +} + +template +__host__ __device__ constexpr auto operator*(Number, Number) +{ + return Number{}; +} + +template +__host__ __device__ constexpr auto operator/(Number, Number) +{ + static_assert(Y > 0, "wrong!"); + return Number{}; +} + +template +__host__ __device__ constexpr auto operator%(Number, Number) +{ + static_assert(Y > 0, "wrong!"); + return Number{}; +} +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/print.hpp b/composable_kernel/include/utility/print.hpp new file mode 100644 index 0000000000..0dd646153a --- /dev/null +++ b/composable_kernel/include/utility/print.hpp @@ -0,0 +1,70 @@ +#ifndef CK_PRINT_HPP +#define CK_PRINT_HPP + +#include "array.hpp" +#include "statically_indexed_array.hpp" +#include "container_helper.hpp" +#include "sequence.hpp" + +namespace ck { + +template +__host__ __device__ void print_array(const char* s, T a) +{ + using data_type = decltype(a.At(Number<0>{})); + constexpr index_t nsize = a.Size(); + +#if 0 + if constexpr(is_same{}) + { + printf("%s size %u, {", s, nsize); + static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("%u, ", uint32_t{a[i]}); }); + printf("}\n"); + } + 
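// --- editorial aside, not part of the patch ---
// Usage sketch for the integer helpers in math.hpp and the Number<>
// arithmetic in number.hpp above. It assumes "math.hpp" (and the headers it
// pulls in) is reachable on the include path; the expected values follow
// directly from the definitions above.
#include "math.hpp"

namespace ck_math_usage_sketch {

using ck::Number;

// ceil(10 / 4) = 3, and the least multiple of 4 that is >= 10 is 12
static_assert(ck::math::integer_divide_ceil(10, 4) == 3, "");
static_assert(ck::math::integer_least_multiple(10, 4) == 12, "");

// gcd / lcm also accept Number<> so the result stays a compile-time constant
static_assert(ck::math::gcd(Number<12>{}, Number<18>{}) == 6, "");
static_assert(ck::math::lcm(Number<6>{}, Number<4>{}) == 12, "");

// Number<> arithmetic from number.hpp: the value lives in the type
static_assert((Number<6>{} * Number<7>{}).value == 42, "");

} // namespace ck_math_usage_sketch
// --- end editorial aside ---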
else if constexpr(is_same{}) + { + printf("%s size %d, {", s, nsize); + static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("%d, ", int32_t{a[i]}); }); + printf("}\n"); + } + else if constexpr(is_same{}) + { + printf("%s size %d, {", s, nsize); + static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("%d, ", bool{a[i]}); }); + printf("}\n"); + } +#else + printf("%s size %d, {", s, nsize); + static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("%d, ", int32_t{a[i]}); }); + printf("}\n"); +#endif +} + +template +__host__ __device__ void print_array_v2(const char* s, T a) +{ + using data_type = decltype(a.At(Number<0>{})); + constexpr index_t nsize = a.Size(); + +#if 0 + if constexpr(is_same{}) + { + printf("%s size %u, {", s, nsize); + static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("[%u] %u, ", i.value, a[i]); }); + printf("}\n"); + } + else if constexpr(is_same{}) + { + printf("%s size %d, {", s, nsize); + static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("[%d] %d, ", i.value, a[i]); }); + printf("}\n"); + } +#else + printf("%s size %d, {", s, nsize); + static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("[%d] %d, ", i.value, a[i]); }); + printf("}\n"); +#endif +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/sequence.hpp b/composable_kernel/include/utility/sequence.hpp new file mode 100644 index 0000000000..81eb488715 --- /dev/null +++ b/composable_kernel/include/utility/sequence.hpp @@ -0,0 +1,882 @@ +#ifndef CK_SEQUENCE_HPP +#define CK_SEQUENCE_HPP + +#include "integral_constant.hpp" +#include "type.hpp" +#include "functional.hpp" +#include "math.hpp" + +namespace ck { + +template +struct static_for; + +template +struct Sequence; + +template +struct sequence_split; + +template +struct sequence_reverse; + +template +struct sequence_map_inverse; + +template +struct is_valid_sequence_map; + +template +__host__ __device__ constexpr auto sequence_pop_front(Sequence); + +template +__host__ __device__ constexpr auto sequence_pop_back(Seq); + +template +struct Sequence +{ + using Type = Sequence; + using data_type = index_t; + + static constexpr index_t mSize = sizeof...(Is); + + __host__ __device__ static constexpr auto Size() { return Number{}; } + + __host__ __device__ static constexpr auto GetSize() { return Size(); } + + __host__ __device__ static constexpr index_t At(index_t I) + { + // the last dummy element is to prevent compiler complain about empty array, when mSize = 0 + const index_t mData[mSize + 1] = {Is..., 0}; + return mData[I]; + } + + template + __host__ __device__ static constexpr auto At(Number) + { + static_assert(I < mSize, "wrong! I too large"); + + return Number{}; + } + + template + __host__ __device__ static constexpr auto Get(Number) + { + return At(Number{}); + } + + template + __host__ __device__ constexpr auto operator[](I i) const + { + return At(i); + } + + template + __host__ __device__ static constexpr auto ReorderGivenNew2Old(Sequence /*new2old*/) + { + static_assert(sizeof...(Is) == sizeof...(IRs), + "wrong! reorder map should have the same size as Sequence to be rerodered"); + + static_assert(is_valid_sequence_map>::value, "wrong! invalid reorder map"); + + return Sequence{})...>{}; + } + + // MapOld2New is Sequence<...> + template + __host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New) + { + static_assert(MapOld2New::Size() == Size(), + "wrong! reorder map should have the same size as Sequence to be rerodered"); + + static_assert(is_valid_sequence_map::value, "wrong! 
invalid reorder map"); + + return ReorderGivenNew2Old(typename sequence_map_inverse::type{}); + } + + __host__ __device__ static constexpr auto Reverse() + { + return typename sequence_reverse::type{}; + } + + __host__ __device__ static constexpr auto Front() + { + static_assert(mSize > 0, "wrong!"); + return At(Number<0>{}); + } + + __host__ __device__ static constexpr auto Back() + { + static_assert(mSize > 0, "wrong!"); + return At(Number{}); + } + + __host__ __device__ static constexpr auto PopFront() { return sequence_pop_front(Type{}); } + + __host__ __device__ static constexpr auto PopBack() { return sequence_pop_back(Type{}); } + + template + __host__ __device__ static constexpr auto PushFront(Sequence) + { + return Sequence{}; + } + + template + __host__ __device__ static constexpr auto PushFront(Number...) + { + return Sequence{}; + } + + template + __host__ __device__ static constexpr auto PushBack(Sequence) + { + return Sequence{}; + } + + template + __host__ __device__ static constexpr auto PushBack(Number...) + { + return Sequence{}; + } + + template + __host__ __device__ static constexpr auto Extract(Number...) + { + return Sequence{})...>{}; + } + + template + __host__ __device__ static constexpr auto Extract(Sequence) + { + return Sequence{})...>{}; + } + + template + __host__ __device__ static constexpr auto Modify(Number, Number) + { + static_assert(I < Size(), "wrong!"); + + using seq_split = sequence_split; + constexpr auto seq_left = typename seq_split::left_type{}; + constexpr auto seq_right = typename seq_split::right_type{}.PopFront(); + + return seq_left.PushBack(Number{}).PushBack(seq_right); + } + + template + __host__ __device__ static constexpr auto Transform(F f) + { + return Sequence{}; + } + + __host__ __device__ static void Print() + { + printf("{"); + printf("size %d, ", index_t{Size()}); + static_for<0, Size(), 1>{}([&](auto i) { printf("%d ", At(i).value); }); + printf("}"); + } +}; + +// merge sequence +template +struct sequence_merge +{ + using type = typename sequence_merge::type>::type; +}; + +template +struct sequence_merge, Sequence> +{ + using type = Sequence; +}; + +template +struct sequence_merge +{ + using type = Seq; +}; + +// generate sequence +template +struct sequence_gen +{ + template + struct sequence_gen_impl + { + static constexpr index_t NRemainLeft = NRemain / 2; + static constexpr index_t NRemainRight = NRemain - NRemainLeft; + static constexpr index_t IMiddle = IBegin + NRemainLeft; + + using type = typename sequence_merge< + typename sequence_gen_impl::type, + typename sequence_gen_impl::type>::type; + }; + + template + struct sequence_gen_impl + { + static constexpr index_t Is = G{}(Number{}); + using type = Sequence; + }; + + template + struct sequence_gen_impl + { + using type = Sequence<>; + }; + + using type = typename sequence_gen_impl<0, NSize, F>::type; +}; + +// arithmetic sequence +template +struct arithmetic_sequence_gen +{ + struct F + { + __host__ __device__ constexpr index_t operator()(index_t i) const + { + return i * Increment + IBegin; + } + }; + + using type = typename sequence_gen<(IEnd - IBegin) / Increment, F>::type; +}; + +// uniform sequence +template +struct uniform_sequence_gen +{ + struct F + { + __host__ __device__ constexpr index_t operator()(index_t) const { return I; } + }; + + using type = typename sequence_gen::type; +}; + +// reverse inclusive scan (with init) sequence +template +struct sequence_reverse_inclusive_scan; + +template +struct sequence_reverse_inclusive_scan, Reduce, Init> +{ + using 
old_scan = typename sequence_reverse_inclusive_scan, Reduce, Init>::type; + + static constexpr index_t new_reduce = Reduce{}(I, old_scan{}.Front()); + + using type = typename sequence_merge, old_scan>::type; +}; + +template +struct sequence_reverse_inclusive_scan, Reduce, Init> +{ + using type = Sequence; +}; + +template +struct sequence_reverse_inclusive_scan, Reduce, Init> +{ + using type = Sequence<>; +}; + +// split sequence +template +struct sequence_split +{ + static constexpr index_t NSize = Seq{}.Size(); + + using range0 = typename arithmetic_sequence_gen<0, I, 1>::type; + using range1 = typename arithmetic_sequence_gen::type; + + using left_type = decltype(Seq::Extract(range0{})); + using right_type = decltype(Seq::Extract(range1{})); +}; + +// reverse sequence +template +struct sequence_reverse +{ + static constexpr index_t NSize = Seq{}.Size(); + + using seq_split = sequence_split; + using type = typename sequence_merge< + typename sequence_reverse::type, + typename sequence_reverse::type>::type; +}; + +template +struct sequence_reverse> +{ + using type = Sequence; +}; + +template +struct sequence_reverse> +{ + using type = Sequence; +}; + +#if 1 +template +struct sequence_reduce +{ + using type = typename sequence_reduce::type>::type; +}; + +template +struct sequence_reduce, Sequence> +{ + using type = Sequence; +}; + +template +struct sequence_reduce +{ + using type = Seq; +}; +#endif + +template +struct sequence_sort_impl +{ + template + struct sorted_sequence_merge_impl + { + static constexpr bool choose_left = LeftValues::Front() < RightValues::Front(); + + static constexpr index_t chosen_value = + choose_left ? LeftValues::Front() : RightValues::Front(); + static constexpr index_t chosen_id = choose_left ? LeftIds::Front() : RightIds::Front(); + + using new_merged_values = decltype(MergedValues::PushBack(Number{})); + using new_merged_ids = decltype(MergedIds::PushBack(Number{})); + + using new_left_values = + typename conditional::type; + using new_left_ids = + typename conditional::type; + + using new_right_values = + typename conditional::type; + using new_right_ids = + typename conditional::type; + + using merge = sorted_sequence_merge_impl; + // this is output + using merged_values = typename merge::merged_values; + using merged_ids = typename merge::merged_ids; + }; + + template + struct sorted_sequence_merge_impl, + Sequence<>, + MergedValues, + MergedIds, + Comp> + { + using merged_values = typename sequence_merge::type; + using merged_ids = typename sequence_merge::type; + }; + + template + struct sorted_sequence_merge_impl, + Sequence<>, + RightValues, + RightIds, + MergedValues, + MergedIds, + Comp> + { + using merged_values = typename sequence_merge::type; + using merged_ids = typename sequence_merge::type; + }; + + template + struct sorted_sequence_merge + { + using merge = sorted_sequence_merge_impl, + Sequence<>, + Comp>; + + using merged_values = typename merge::merged_values; + using merged_ids = typename merge::merged_ids; + }; + + static constexpr index_t nsize = Values::Size(); + + using split_unsorted_values = sequence_split; + using split_unsorted_ids = sequence_split; + + using left_unsorted_values = typename split_unsorted_values::left_type; + using left_unsorted_ids = typename split_unsorted_ids::left_type; + using left_sort = sequence_sort_impl; + using left_sorted_values = typename left_sort::sorted_values; + using left_sorted_ids = typename left_sort::sorted_ids; + + using right_unsorted_values = typename split_unsorted_values::right_type; + 
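// --- editorial aside, not part of the patch ---
// Usage sketch for the compile-time Sequence utilities in this header,
// including the scan helper defined further below. It assumes this file is
// reachable as "sequence.hpp" together with the headers it includes; the
// expected results in the static_asserts follow directly from the
// definitions (element-wise operators, Reverse, ReorderGivenNew2Old picking
// new[k] = old[new2old[k]], and the reverse exclusive scan that turns packed
// lengths into strides).
#include "sequence.hpp"

namespace ck_sequence_usage_sketch {

using ck::Sequence;
using ck::Number;
using ck::is_same;

// element-wise arithmetic between two sequences of equal size
static_assert(is_same<decltype(Sequence<1, 2, 3>{} + Sequence<10, 20, 30>{}),
                      Sequence<11, 22, 33>>::value,
              "");

// reversing and reordering (new2old = {2, 0, 1} puts old element 2 first)
static_assert(is_same<decltype(Sequence<2, 5, 3>::Reverse()), Sequence<3, 5, 2>>::value, "");
static_assert(is_same<decltype(Sequence<2, 5, 3>::ReorderGivenNew2Old(Sequence<2, 0, 1>{})),
                      Sequence<3, 2, 5>>::value,
              "");

// packed strides from lengths {2, 3, 4}: stride[i] = product of lengths after i
static_assert(is_same<decltype(ck::reverse_exclusive_scan_sequence(
                          Sequence<2, 3, 4>{}, ck::math::multiplies<ck::index_t>{}, Number<1>{})),
                      Sequence<12, 4, 1>>::value,
              "");

} // namespace ck_sequence_usage_sketch
// --- end editorial aside ---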
using right_unsorted_ids = typename split_unsorted_ids::right_type; + using right_sort = sequence_sort_impl; + using right_sorted_values = typename right_sort::sorted_values; + using right_sorted_ids = typename right_sort::sorted_ids; + + using merged_sorted = sorted_sequence_merge; + + using sorted_values = typename merged_sorted::merged_values; + using sorted_ids = typename merged_sorted::merged_ids; +}; + +template +struct sequence_sort_impl, Sequence, Compare> +{ + static constexpr bool choose_x = Compare{}(ValueX, ValueY); + + using sorted_values = + typename conditional, Sequence>::type; + using sorted_ids = typename conditional, Sequence>::type; +}; + +template +struct sequence_sort_impl, Sequence, Compare> +{ + using sorted_values = Sequence; + using sorted_ids = Sequence; +}; + +template +struct sequence_sort_impl, Sequence<>, Compare> +{ + using sorted_values = Sequence<>; + using sorted_ids = Sequence<>; +}; + +template +struct sequence_sort +{ + using unsorted_ids = typename arithmetic_sequence_gen<0, Values::Size(), 1>::type; + using sort = sequence_sort_impl; + + // this is output + using type = typename sort::sorted_values; + using sorted2unsorted_map = typename sort::sorted_ids; +}; + +template +struct sequence_unique_sort +{ + template + struct sorted_sequence_uniquify_impl + { + static constexpr index_t current_value = RemainValues::Front(); + static constexpr index_t current_id = RemainIds::Front(); + + static constexpr bool is_unique_value = (current_value != UniquifiedValues::Back()); + + using new_remain_values = decltype(RemainValues::PopFront()); + using new_remain_ids = decltype(RemainIds::PopFront()); + + using new_uniquified_values = + typename conditional{})), + UniquifiedValues>::type; + + using new_uniquified_ids = + typename conditional{})), + UniquifiedIds>::type; + + using uniquify = sorted_sequence_uniquify_impl; + + // this is output + using uniquified_values = typename uniquify::uniquified_values; + using uniquified_ids = typename uniquify::uniquified_ids; + }; + + template + struct sorted_sequence_uniquify_impl, + Sequence<>, + UniquifiedValues, + UniquifiedIds, + Eq> + { + using uniquified_values = UniquifiedValues; + using uniquified_ids = UniquifiedIds; + }; + + template + struct sorted_sequence_uniquify + { + using uniquify = sorted_sequence_uniquify_impl, + Sequence, + Eq>; + + using uniquified_values = typename uniquify::uniquified_values; + using uniquified_ids = typename uniquify::uniquified_ids; + }; + + using sort = sequence_sort; + using sorted_values = typename sort::type; + using sorted_ids = typename sort::sorted2unsorted_map; + + using uniquify = sorted_sequence_uniquify; + + // this is output + using type = typename uniquify::uniquified_values; + using sorted2unsorted_map = typename uniquify::uniquified_ids; +}; + +template +struct is_valid_sequence_map : is_same::type, + typename sequence_sort>::type> +{ +}; + +template +struct sequence_map_inverse +{ + template + struct sequence_map_inverse_impl + { + static constexpr auto new_y2x = + WorkingY2X::Modify(X2Y::At(Number{}), Number{}); + + using type = + typename sequence_map_inverse_impl:: + type; + }; + + template + struct sequence_map_inverse_impl + { + using type = WorkingY2X; + }; + + using type = + typename sequence_map_inverse_impl::type, + 0, + SeqMap::Size()>::type; +}; + +template +__host__ __device__ constexpr auto operator+(Sequence, Sequence) +{ + static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! 
inconsistent size"); + + return Sequence<(Xs + Ys)...>{}; +} + +template +__host__ __device__ constexpr auto operator-(Sequence, Sequence) +{ + static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size"); + + return Sequence<(Xs - Ys)...>{}; +} + +template +__host__ __device__ constexpr auto operator*(Sequence, Sequence) +{ + static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size"); + + return Sequence<(Xs * Ys)...>{}; +} + +template +__host__ __device__ constexpr auto operator/(Sequence, Sequence) +{ + static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size"); + + return Sequence<(Xs / Ys)...>{}; +} + +template +__host__ __device__ constexpr auto operator%(Sequence, Sequence) +{ + static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size"); + + return Sequence<(Xs % Ys)...>{}; +} + +template +__host__ __device__ constexpr auto operator+(Sequence, Number) +{ + return Sequence<(Xs + Y)...>{}; +} + +template +__host__ __device__ constexpr auto operator-(Sequence, Number) +{ + return Sequence<(Xs - Y)...>{}; +} + +template +__host__ __device__ constexpr auto operator*(Sequence, Number) +{ + return Sequence<(Xs * Y)...>{}; +} + +template +__host__ __device__ constexpr auto operator/(Sequence, Number) +{ + return Sequence<(Xs / Y)...>{}; +} + +template +__host__ __device__ constexpr auto operator%(Sequence, Number) +{ + return Sequence<(Xs % Y)...>{}; +} + +template +__host__ __device__ constexpr auto operator+(Number, Sequence) +{ + return Sequence<(Y + Xs)...>{}; +} + +template +__host__ __device__ constexpr auto operator-(Number, Sequence) +{ + constexpr auto seq_x = Sequence{}; + + return Sequence<(Y - Xs)...>{}; +} + +template +__host__ __device__ constexpr auto operator*(Number, Sequence) +{ + return Sequence<(Y * Xs)...>{}; +} + +template +__host__ __device__ constexpr auto operator/(Number, Sequence) +{ + return Sequence<(Y / Xs)...>{}; +} + +template +__host__ __device__ constexpr auto operator%(Number, Sequence) +{ + return Sequence<(Y % Xs)...>{}; +} + +template +__host__ __device__ constexpr auto sequence_pop_front(Sequence) +{ + return Sequence{}; +} + +template +__host__ __device__ constexpr auto sequence_pop_back(Seq) +{ + static_assert(Seq::Size() > 0, "wrong! cannot pop an empty Sequence!"); + return sequence_pop_front(Seq::Reverse()).Reverse(); +} + +template +__host__ __device__ constexpr auto merge_sequences(Seqs...) 
+{ + return typename sequence_merge::type{}; +} + +template +__host__ __device__ constexpr auto transform_sequences(F f, Sequence) +{ + return Sequence{}; +} + +template +__host__ __device__ constexpr auto transform_sequences(F f, Sequence, Sequence) +{ + static_assert(Sequence::mSize == Sequence::mSize, "Dim not the same"); + + return Sequence{}; +} + +template +__host__ __device__ constexpr auto +transform_sequences(F f, Sequence, Sequence, Sequence) +{ + static_assert(Sequence::mSize == Sequence::mSize && + Sequence::mSize == Sequence::mSize, + "Dim not the same"); + + return Sequence{}; +} + +template +__host__ __device__ constexpr auto reverse_inclusive_scan_sequence(Seq, Reduce, Number) +{ + return typename sequence_reverse_inclusive_scan::type{}; +} + +template +__host__ __device__ constexpr auto reverse_exclusive_scan_sequence(Seq, Reduce, Number) +{ + return reverse_inclusive_scan_sequence(Seq::PopFront(), Reduce{}, Number{}) + .PushBack(Number{}); +} + +template +__host__ __device__ constexpr auto inclusive_scan_sequence(Seq, Reduce, Number) +{ + return reverse_inclusive_scan_sequence(Seq{}.Reverse(), Reduce{}, Number{}).Reverse(); +} + +template +__host__ __device__ constexpr auto pick_sequence_elements_by_ids(Seq, Sequence /* ids */) +{ + return Sequence{})...>{}; +} + +#if 1 +namespace detail { +template +struct pick_sequence_elements_by_mask_impl +{ + using new_work_seq = typename conditional::type; + + using type = + typename pick_sequence_elements_by_mask_impl::type; +}; + +template +struct pick_sequence_elements_by_mask_impl, Sequence<>> +{ + using type = WorkSeq; +}; + +} // namespace detail + +template +__host__ __device__ constexpr auto pick_sequence_elements_by_mask(Seq, Mask) +{ + static_assert(Seq::Size() == Mask::Size(), "wrong!"); + + return typename detail::pick_sequence_elements_by_mask_impl, Seq, Mask>::type{}; +} + +namespace detail { +template +struct modify_sequence_elements_by_ids_impl +{ + using new_work_seq = decltype(WorkSeq::Modify(RemainIds::Front(), RemainValues::Front())); + + using type = + typename modify_sequence_elements_by_ids_impl::type; +}; + +template +struct modify_sequence_elements_by_ids_impl, Sequence<>> +{ + using type = WorkSeq; +}; +} // namespace detail + +template +__host__ __device__ constexpr auto modify_sequence_elements_by_ids(Seq, Values, Ids) +{ + static_assert(Values::Size() == Ids::Size() && Seq::Size() >= Values::Size(), "wrong!"); + + return typename detail::modify_sequence_elements_by_ids_impl::type{}; +} +#endif + +template +__host__ __device__ constexpr index_t +reduce_on_sequence(Seq, Reduce f, Number /*initial_value*/) +{ + index_t result = Init; + + for(index_t i = 0; i < Seq::Size(); ++i) + { + result = f(result, Seq::At(i)); + } + + return result; +} + +// TODO: a generic any_of for any container +template +__host__ __device__ constexpr bool sequence_any_of(Seq, F f) +{ + bool flag = false; + + for(index_t i = 0; i < Seq::Size(); ++i) + { + flag = flag || f(Seq::At(i)); + } + + return flag; +} + +// TODO: a generic all_of for any container +template +__host__ __device__ constexpr bool sequence_all_of(Seq, F f) +{ + bool flag = true; + + for(index_t i = 0; i < Seq::Size(); ++i) + { + flag = flag && f(Seq::At(i)); + } + + return flag; +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/sequence_helper.hpp b/composable_kernel/include/utility/sequence_helper.hpp new file mode 100644 index 0000000000..88d7da63e8 --- /dev/null +++ b/composable_kernel/include/utility/sequence_helper.hpp @@ -0,0 
+1,36 @@ +#ifndef CK_SEQUENCE_HELPER_HPP +#define CK_SEQUENCE_HELPER_HPP + +#include "tuple.hpp" + +namespace ck { + +template +__host__ __device__ constexpr auto make_sequence(Number...) +{ + return Sequence{}; +} + +// F returns index_t +template +__host__ __device__ constexpr auto generate_sequence(F, Number) +{ + return typename sequence_gen::type{}; +} + +// F returns Number<> +template +__host__ __device__ constexpr auto generate_sequence_v2(F&& f, Number) +{ + return unpack([&f](auto&&... xs) { return make_sequence(f(xs)...); }, + typename arithmetic_sequence_gen<0, N, 1>::type{}); +} + +template +__host__ __device__ constexpr auto to_sequence(Tuple...>) +{ + return Sequence{}; +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/static_buffer.hpp b/composable_kernel/include/utility/static_buffer.hpp new file mode 100644 index 0000000000..a23cf4f80d --- /dev/null +++ b/composable_kernel/include/utility/static_buffer.hpp @@ -0,0 +1,35 @@ +#ifndef CK_STATIC_BUFFER_HPP +#define CK_STATIC_BUFFER_HPP + +#include "statically_indexed_array.hpp" + +namespace ck { + +template +struct StaticBuffer : public StaticallyIndexedArray +{ + using type = T; + using base = StaticallyIndexedArray; + + __host__ __device__ constexpr StaticBuffer() : base{} {} + + __host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace() + { + return BufferAddressSpace; + } + + __host__ __device__ static constexpr bool IsStaticBuffer() { return true; } + + __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; } +}; + +template +__host__ __device__ constexpr auto make_static_buffer(Number) +{ + return StaticBuffer{}; +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/statically_indexed_array.hpp b/composable_kernel/include/utility/statically_indexed_array.hpp new file mode 100644 index 0000000000..f30a3a9ee6 --- /dev/null +++ b/composable_kernel/include/utility/statically_indexed_array.hpp @@ -0,0 +1,40 @@ +#ifndef CK_STATICALLY_INDEXED_ARRAY_HPP +#define CK_STATICALLY_INDEXED_ARRAY_HPP + +#include "functional2.hpp" +#include "sequence.hpp" +#include "tuple.hpp" + +namespace ck { + +namespace detail { + +template +__host__ __device__ constexpr auto generate_same_type_tuple() +{ + return generate_tuple([](auto) -> T { return T{}; }, Number{}); +} + +template +using same_type_tuple = decltype(generate_same_type_tuple()); + +} // namespace detail + +template +using StaticallyIndexedArray = detail::same_type_tuple; + +template +__host__ __device__ constexpr auto make_statically_indexed_array(const X& x, const Xs&... xs) +{ + return StaticallyIndexedArray(x, static_cast(xs)...); +} + +// make empty StaticallyIndexedArray +template +__host__ __device__ constexpr auto make_statically_indexed_array() +{ + return StaticallyIndexedArray(); +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/statically_indexed_array_multi_index.hpp b/composable_kernel/include/utility/statically_indexed_array_multi_index.hpp new file mode 100644 index 0000000000..9e96f06d73 --- /dev/null +++ b/composable_kernel/include/utility/statically_indexed_array_multi_index.hpp @@ -0,0 +1,108 @@ +#ifndef CK_STATICALLY_INDEXED_ARRAY_MULTI_INDEX_HPP +#define CK_STATICALLY_INDEXED_ARRAY_MULTI_INDEX_HPP + +#include "common_header.hpp" + +namespace ck { + +template +using MultiIndex = StaticallyIndexedArray; + +template +__host__ __device__ constexpr auto make_multi_index(Xs&&... 
xs) +{ + return make_statically_indexed_array(index_t{xs}...); +} + +template +__host__ __device__ constexpr auto make_zero_multi_index() +{ + return unpack([](auto... xs) { return make_multi_index(xs...); }, + typename uniform_sequence_gen::type{}); +} + +template +__host__ __device__ constexpr auto to_multi_index(const T& x) +{ + return unpack([](auto... ys) { return make_multi_index(ys...); }, x); +} + +// Here should use MultiIndex, instead of Tuple, although the former +// is the alias of the latter. This is because compiler cannot infer the NSize if +// using MultiIndex +// TODO: how to fix this? +template +__host__ __device__ constexpr auto operator+=(Tuple& y, const X& x) +{ + static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same"); + constexpr index_t NSize = sizeof...(Ys); + static_for<0, NSize, 1>{}([&](auto i) { y(i) += x[i]; }); + return y; +} + +template +__host__ __device__ constexpr auto operator-=(Tuple& y, const X& x) +{ + static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same"); + constexpr index_t NSize = sizeof...(Ys); + static_for<0, NSize, 1>{}([&](auto i) { y(i) -= x[i]; }); + return y; +} + +template +__host__ __device__ constexpr auto operator+(const Tuple& x, const Y& y) +{ + static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same"); + constexpr index_t NSize = sizeof...(Xs); + + Tuple r; + static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] + y[i]; }); + return r; +} + +template +__host__ __device__ constexpr auto operator-(const Tuple& x, const Y& y) +{ + static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same"); + constexpr index_t NSize = sizeof...(Xs); + + Tuple r; + static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] - y[i]; }); + return r; +} + +template +__host__ __device__ constexpr auto operator*(const Tuple& x, const Y& y) +{ + static_assert(Y::Size() == sizeof...(Xs), "wrong! 
size not the same"); + constexpr index_t NSize = sizeof...(Xs); + + Tuple r; + static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] * y[i]; }); + return r; +} + +// MultiIndex = index_t * MultiIndex +template +__host__ __device__ constexpr auto operator*(index_t a, const Tuple& x) +{ + constexpr index_t NSize = sizeof...(Xs); + + Tuple r; + static_for<0, NSize, 1>{}([&](auto i) { r(i) = a * x[i]; }); + return r; +} + +template +__host__ __device__ void print_multi_index(const Tuple& x) +{ + printf("{"); + printf("MultiIndex, "); + printf("size %d,", index_t{sizeof...(Xs)}); + static_for<0, sizeof...(Xs), 1>{}( + [&](auto i) { printf("%d ", static_cast(x.At(i))); }); + printf("}"); +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/synchronization.hpp b/composable_kernel/include/utility/synchronization.hpp new file mode 100644 index 0000000000..da74f2074d --- /dev/null +++ b/composable_kernel/include/utility/synchronization.hpp @@ -0,0 +1,21 @@ +#ifndef CK_SYNCHRONIZATION_AMD_HPP +#define CK_SYNCHRONIZATION_AMD_HPP + +#include "config.hpp" + +namespace ck { + +__device__ void block_sync_lds() +{ +#if CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM + asm volatile("\ + s_waitcnt lgkmcnt(0) \n \ + s_barrier \ + " ::); +#else + __syncthreads(); +#endif +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/tuple.hpp b/composable_kernel/include/utility/tuple.hpp new file mode 100644 index 0000000000..15b73011b4 --- /dev/null +++ b/composable_kernel/include/utility/tuple.hpp @@ -0,0 +1,167 @@ +#ifndef CK_TUPLE_HPP +#define CK_TUPLE_HPP + +#include "integral_constant.hpp" +#include "sequence.hpp" +#include "type.hpp" + +namespace ck { + +namespace detail { + +template +struct TupleElementKey +{ + __host__ __device__ constexpr TupleElementKey() = default; +}; + +template +struct TupleElement +{ + __host__ __device__ constexpr TupleElement() = default; + + template < + typename T, + typename std::enable_if>, TupleElement>::value, + bool>::type = false> + __host__ __device__ constexpr TupleElement(T&& v) : mData(std::forward(v)) + { + } + + Data mData; +}; + +template +__host__ __device__ constexpr const Data& get_tuple_element(const TupleElement& x) +{ + return static_cast(x.mData); +} + +template +__host__ __device__ constexpr Data& get_tuple_element(TupleElement& x) +{ + return x.mData; +} + +// TODO: not sure the use of reference is correct +template +__host__ __device__ constexpr Data&& get_tuple_element(TupleElement&& x) +{ + return static_cast(x.mData); +} + +template +struct TupleImpl; + +template +struct TupleImpl, Xs...> : TupleElement, Xs>... +{ + __host__ __device__ constexpr TupleImpl() = default; + + template < + typename Y, + typename std::enable_if>, TupleImpl>::value, + bool>::type = false> + __host__ __device__ constexpr TupleImpl(Y&& y) + : TupleElement, Xs>(std::forward(y))... + { + } + + template = 2, bool>::type = false> + __host__ __device__ constexpr TupleImpl(Ys&&... ys) + : TupleElement, Xs>(std::forward(ys))... + { + static_assert(sizeof...(Is) == sizeof...(Xs) && sizeof...(Is) == sizeof...(Ys), + "wrong! 
inconsistent size"); + } + + __host__ __device__ static constexpr index_t Size() { return sizeof...(Xs); } + + template + __host__ __device__ constexpr const auto& GetElementByKey(TupleElementKey) const + { + return get_tuple_element>(*this); + } + + template + __host__ __device__ constexpr auto& GetElementByKey(TupleElementKey) + { + return get_tuple_element>(*this); + } +}; + +} // namespace detail + +template +struct Tuple : detail::TupleImpl::type, Xs...> +{ + using base = + detail::TupleImpl::type, Xs...>; + + __host__ __device__ constexpr Tuple() = default; + + template >, Tuple>::value, + bool>::type = false> + __host__ __device__ constexpr Tuple(Y&& y) : base(std::forward(y)) + { + } + + template = 2, + bool>::type = false> + __host__ __device__ constexpr Tuple(Ys&&... ys) : base(std::forward(ys)...) + { + } + + __host__ __device__ static constexpr index_t Size() { return sizeof...(Xs); } + + template + __host__ __device__ constexpr const auto& At(Number) const + { + static_assert(I < base::Size(), "wrong! out of range"); + return base::GetElementByKey(detail::TupleElementKey{}); + } + + template + __host__ __device__ constexpr auto& At(Number) + { + static_assert(I < base::Size(), "wrong! out of range"); + return base::GetElementByKey(detail::TupleElementKey{}); + } + + template + __host__ __device__ constexpr const auto& operator[](Number i) const + { + return At(i); + } + + template + __host__ __device__ constexpr auto& operator()(Number i) + { + return At(i); + } + + template + __host__ __device__ constexpr auto operator=(const T& a) + { + static_assert(T::Size() == Size(), "wrong! size not the same"); + + static_for<0, Size(), 1>{}([&](auto i) { operator()(i) = a[i]; }); + + return *this; + } + + __host__ __device__ static constexpr bool IsStaticBuffer() { return true; } +}; + +template +__host__ __device__ constexpr auto make_tuple(Xs&&... xs) +{ + return Tuple>...>(std::forward(xs)...); +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/tuple_helper.hpp b/composable_kernel/include/utility/tuple_helper.hpp new file mode 100644 index 0000000000..9499a3596c --- /dev/null +++ b/composable_kernel/include/utility/tuple_helper.hpp @@ -0,0 +1,80 @@ +#ifndef CK_TUPLE_HELPER_HPP +#define CK_TUPLE_HELPER_HPP + +#include "functional4.hpp" +#include "tuple.hpp" + +namespace ck { + +template +struct is_known_at_compile_time> +{ + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return container_reduce( + Tuple{}, + [](auto x, bool r) { + return is_known_at_compile_time< + remove_cv_t>>::value & + r; + }, + true); + } + + static constexpr bool value = IsKnownAtCompileTime(); +}; + +template +__host__ __device__ constexpr auto generate_tuple(F&& f, Number) +{ + return unpack([&f](auto&&... 
xs) { return make_tuple(f(xs)...); }, + typename arithmetic_sequence_gen<0, N, 1>::type{}); +} + +namespace detail { + +template +__host__ __device__ constexpr auto transform_tuples_impl(F f, const X& x, Sequence) +{ + return make_tuple(f(x.At(Number{}))...); +} + +template +__host__ __device__ constexpr auto +transform_tuples_impl(F f, const X& x, const Y& y, Sequence) +{ + return make_tuple(f(x.At(Number{}), y.At(Number{}))...); +} + +template +__host__ __device__ constexpr auto +transform_tuples_impl(F f, const X& x, const Y& y, const Z& z, Sequence) +{ + return make_tuple(f(x.At(Number{}), y.At(Number{}), z.At(Number{}))...); +} + +} // namespace detail + +template +__host__ __device__ constexpr auto transform_tuples(F f, const X& x) +{ + return detail::transform_tuples_impl( + f, x, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{}); +} + +template +__host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y) +{ + return detail::transform_tuples_impl( + f, x, y, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{}); +} + +template +__host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y, const Z& z) +{ + return detail::transform_tuples_impl( + f, x, y, z, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{}); +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/type.hpp b/composable_kernel/include/utility/type.hpp new file mode 100644 index 0000000000..32f7dfb569 --- /dev/null +++ b/composable_kernel/include/utility/type.hpp @@ -0,0 +1,60 @@ +#ifndef CK_TYPE_HPP +#define CK_TYPE_HPP + +#include "integral_constant.hpp" + +namespace ck { + +template +struct is_same : public integral_constant +{ +}; + +template +struct is_same : public integral_constant +{ +}; + +template +using remove_reference_t = typename std::remove_reference::type; + +template +using remove_cv_t = typename std::remove_cv::type; + +template +constexpr std::remove_reference_t&& move(T&& t) noexcept +{ + return static_cast::type&&>(t); +} + +template +struct is_known_at_compile_time; + +template <> +struct is_known_at_compile_time +{ + static constexpr bool value = false; +}; + +template +struct is_known_at_compile_time> +{ + static constexpr bool value = true; +}; + +template ::type = false> +__host__ __device__ constexpr Y as_type(X x) +{ + union AsType + { + X x; + Y y; + }; + + return AsType{x}.y; +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/utility.hpp b/composable_kernel/include/utility/utility.hpp new file mode 100644 index 0000000000..9f34e044b7 --- /dev/null +++ b/composable_kernel/include/utility/utility.hpp @@ -0,0 +1,14 @@ +#ifndef CK_UTILITY_HPP +#define CK_UTILITY_HPP + +#include "config.hpp" + +namespace ck { + +__device__ index_t get_thread_local_1d_id() { return threadIdx.x; } + +__device__ index_t get_block_1d_id() { return blockIdx.x; } + +} // namespace ck + +#endif diff --git a/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp b/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp new file mode 100644 index 0000000000..652ccdb926 --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp @@ -0,0 +1,374 @@ +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "gridwise_dynamic_gemm_dlops_v1r2.hpp" +#include 
"transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp" + +using namespace ck; + +constexpr DataTypeEnum_t ABDataTypeEnum = static_cast(CK_PARAM_ABDataTypeEnum); +constexpr DataTypeEnum_t AccDataTypeEnum = static_cast(CK_PARAM_AccDataTypeEnum); +constexpr DataTypeEnum_t CDataTypeEnum = static_cast(CK_PARAM_CDataTypeEnum); + +using FloatAB = typename get_datatype_from_enum::type; +using FloatAcc = typename get_datatype_from_enum::type; +using FloatC = typename get_datatype_from_enum::type; + +constexpr index_t BlockSize = CK_PARAM_BlockSize; + +constexpr index_t MPerBlock = CK_PARAM_MPerBlock; +constexpr index_t NPerBlock = CK_PARAM_NPerBlock; +constexpr index_t KPerBlock = CK_PARAM_KPerBlock; +constexpr index_t M1PerThread = CK_PARAM_M1PerThread; +constexpr index_t N1PerThread = CK_PARAM_N1PerThread; +constexpr index_t KPerThread = CK_PARAM_KPerThread; +constexpr index_t M1N1ThreadClusterM10 = CK_PARAM_M1N1ThreadClusterM10; +constexpr index_t M1N1ThreadClusterN10 = CK_PARAM_M1N1ThreadClusterN10; +constexpr index_t M1N1ThreadClusterM11 = CK_PARAM_M1N1ThreadClusterM11; +constexpr index_t M1N1ThreadClusterN11 = CK_PARAM_M1N1ThreadClusterN11; + +using ABlockTransferThreadSliceLengths_K_M0_M1 = + Sequence; +using ABlockTransferThreadClusterLengths_K_M0_M1 = + Sequence; +using ABlockTransferThreadClusterArrangeOrder = + Sequence; +using ABlockTransferSrcAccessOrder = Sequence; + +constexpr index_t ABlockTransferSrcVectorDim = CK_PARAM_ABlockTransferSrcVectorDim; +constexpr index_t ABlockTransferSrcScalarPerVector = CK_PARAM_ABlockTransferSrcScalarPerVector; +constexpr index_t ABlockTransferDstScalarPerVector_M1 = + CK_PARAM_ABlockTransferDstScalarPerVector_M1; +constexpr bool AThreadTransferSrcResetCoordinateAfterRun = + static_cast(CK_PARAM_AThreadTransferSrcResetCoordinateAfterRun); + +using BBlockTransferThreadSliceLengths_K_N0_N1 = + Sequence; +using BBlockTransferThreadClusterLengths_K_N0_N1 = + Sequence; +using BBlockTransferThreadClusterArrangeOrder = + Sequence; +using BBlockTransferSrcAccessOrder = Sequence; + +constexpr index_t BBlockTransferSrcVectorDim = CK_PARAM_BBlockTransferSrcVectorDim; +constexpr index_t BBlockTransferSrcScalarPerVector = CK_PARAM_BBlockTransferSrcScalarPerVector; +constexpr index_t BBlockTransferDstScalarPerVector_N1 = + CK_PARAM_BBlockTransferDstScalarPerVector_N1; +constexpr bool BThreadTransferSrcResetCoordinateAfterRun = + static_cast(CK_PARAM_BThreadTransferSrcResetCoordinateAfterRun); + +using CThreadTransferSrcDstAccessOrder = Sequence; +constexpr index_t CThreadTransferSrcDstVectorDim = CK_PARAM_CThreadTransferSrcDstVectorDim; +constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector; + +constexpr bool HasMainKBlockLoop = static_cast(CK_PARAM_HAS_MAIN_KBLOCK_LOOP); +constexpr bool HasDoubleTailKBlockLoop = static_cast(CK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP); + +extern "C" __global__ void +dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare( + int n, + int c, + int hi, + int wi, + int k, + int y, + int x, + int convStrideH, + int convStrideW, + int convDilationY, + int convDilationX, + int leftPadH, + int leftPadW, + int rightPadH, + int rightPadW, + void* p_a_k_m0_m1_grid_desc, + void* p_b_k_n0_n1_grid_desc, + void* p_c_m0_m10_m11_n0_n10_n11_grid_desc, + void* p_c_blockid_to_m0_n0_block_cluster_adaptor) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + + const index_t ho = (hi + leftPadH + rightPadH - convDilationY * (y - 
1) - 1) / convStrideH + 1; + const index_t wo = (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1; + + const auto in_n_c_hi_wi_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, c, hi, wi)); + const auto wei_k_c_y_x_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(k, c, y, x)); + const auto out_n_k_ho_wo_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, k, ho, wo)); + + const auto descs = transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad( + wei_k_c_y_x_desc, + in_n_c_hi_wi_desc, + out_n_k_ho_wo_desc, + make_tuple(convStrideH, convStrideW), + make_tuple(convDilationY, convDilationX), + make_tuple(leftPadH, leftPadW), + make_tuple(rightPadH, rightPadW)); + + const auto a_k_m_grid_desc = descs[I0]; + const auto b_k_n_grid_desc = descs[I1]; + const auto c_m_n_grid_desc = descs[I2]; + + using AKMGridDesc = decltype(a_k_m_grid_desc); + using BKNGridDesc = decltype(b_k_n_grid_desc); + using CMNGridDesc = decltype(c_m_n_grid_desc); + + using AGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}))); + + using BGridIteratorHacks = + decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}))); + + using CGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}))); + + using AGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0>; + using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>; + + using GridwiseGemm = + GridwiseDynamicGemmDlops_km_kn_mn_v1r2; + + auto a_k_m0_m1_grid_desc = GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc); + auto b_k_n0_n1_grid_desc = GridwiseGemm::MakeBKN0N1GridDescriptor(b_k_n_grid_desc); + auto c_m0_m10_m11_n0_n10_n11_grid_desc = + GridwiseGemm::MakeCM0M10M11N0N10N11GridDescriptor(c_m_n_grid_desc); + auto c_blockid_to_m0_n0_block_cluster_adaptor = + GridwiseGemm::MakeCBlockIdToM0N0BlockClusterAdaptor(c_m_n_grid_desc); + + if(hipThreadIdx_x == 0) + { + *static_cast(p_a_k_m0_m1_grid_desc) = a_k_m0_m1_grid_desc; + *static_cast(p_b_k_n0_n1_grid_desc) = b_k_n0_n1_grid_desc; + *static_cast( + p_c_m0_m10_m11_n0_n10_n11_grid_desc) = c_m0_m10_m11_n0_n10_n11_grid_desc; + *static_cast( + p_c_blockid_to_m0_n0_block_cluster_adaptor) = c_blockid_to_m0_n0_block_cluster_adaptor; + }; +}; + +extern "C" __global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const void CONSTANT* p_a_k_m0_m1_grid_desc, + const void CONSTANT* 
p_b_k_n0_n1_grid_desc, + const void CONSTANT* p_c_m0_m10_m11_n0_n10_n11_grid_desc, + const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + + constexpr auto in_n_c_hi_wi_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 28, 28)); + constexpr auto wei_k_c_y_x_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 3, 3)); + constexpr auto out_n_k_ho_wo_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 28, 28)); + + constexpr auto descs = + transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc, + in_n_c_hi_wi_desc, + out_n_k_ho_wo_desc, + make_tuple(1, 1), + make_tuple(1, 1), + make_tuple(1, 1), + make_tuple(1, 1)); + + constexpr auto a_k_m_grid_desc = descs[I0]; + constexpr auto b_k_n_grid_desc = descs[I1]; + constexpr auto c_m_n_grid_desc = descs[I2]; + + using AKMGridDesc = decltype(a_k_m_grid_desc); + using BKNGridDesc = decltype(b_k_n_grid_desc); + using CMNGridDesc = decltype(c_m_n_grid_desc); + + using AGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}))); + + using BGridIteratorHacks = + decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}))); + + using CGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}))); + + using AGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0>; + using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>; + + using GridwiseGemm = + GridwiseDynamicGemmDlops_km_kn_mn_v1r2; + + constexpr auto a_k_m0_m1_grid_desc_tmp = + GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc); + constexpr auto b_k_n0_n1_grid_desc_tmp = + GridwiseGemm::MakeBKN0N1GridDescriptor(b_k_n_grid_desc); + constexpr auto c_m0_m10_m11_n0_n10_n11_grid_desc_tmp = + GridwiseGemm::MakeCM0M10M11N0N10N11GridDescriptor(c_m_n_grid_desc); + constexpr auto c_blockid_to_m0_n0_block_cluster_adaptor_tmp = + GridwiseGemm::MakeCBlockIdToM0N0BlockClusterAdaptor(c_m_n_grid_desc); + + using AKM0M1GridDesc = decltype(a_k_m0_m1_grid_desc_tmp); + using BKN0N1GridDesc = decltype(b_k_n0_n1_grid_desc_tmp); + using CM0M10M11N0N10N11GridDesc = decltype(c_m0_m10_m11_n0_n10_n11_grid_desc_tmp); + using CBlockIdToM0N0BlockClusterAdaptor = + decltype(c_blockid_to_m0_n0_block_cluster_adaptor_tmp); + + const auto a_k_m0_m1_grid_desc = + *reinterpret_cast((const void*)p_a_k_m0_m1_grid_desc); + const auto b_k_n0_n1_grid_desc = + *reinterpret_cast((const void*)p_b_k_n0_n1_grid_desc); + const auto c_m0_m10_m11_n0_n10_n11_grid_desc = + *reinterpret_cast( + (const 
void*)p_c_m0_m10_m11_n0_n10_n11_grid_desc); + const auto c_blockid_to_m0_n0_block_cluster_adaptor = + *reinterpret_cast( + (const void*)p_c_blockid_to_m0_n0_block_cluster_adaptor); + + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared_block, + a_k_m0_m1_grid_desc, + b_k_n0_n1_grid_desc, + c_m0_m10_m11_n0_n10_n11_grid_desc, + c_blockid_to_m0_n0_block_cluster_adaptor, + integral_constant{}, + integral_constant{}); +}; diff --git a/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp b/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp new file mode 100644 index 0000000000..d33bc74aa6 --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp @@ -0,0 +1,362 @@ +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "gridwise_dynamic_gemm_xdlops_v2r3.hpp" +#include "transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp" + +using namespace ck; + +constexpr DataTypeEnum_t ABDataTypeEnum = static_cast(CK_PARAM_ABDataTypeEnum); +constexpr DataTypeEnum_t AccDataTypeEnum = static_cast(CK_PARAM_AccDataTypeEnum); +constexpr DataTypeEnum_t CDataTypeEnum = static_cast(CK_PARAM_CDataTypeEnum); + +using FloatAB = typename get_datatype_from_enum::type; +using FloatAcc = typename get_datatype_from_enum::type; +using FloatC = typename get_datatype_from_enum::type; + +constexpr index_t BlockSize = CK_PARAM_BlockSize; + +constexpr index_t MPerBlock = CK_PARAM_MPerBlock; +constexpr index_t NPerBlock = CK_PARAM_NPerBlock; +constexpr index_t KPerBlock = CK_PARAM_KPerBlock; + +constexpr index_t MPerWave = CK_PARAM_MPerWave; +constexpr index_t NPerWave = CK_PARAM_NPerWave; +constexpr index_t MRepeat = CK_PARAM_MRepeat; +constexpr index_t NRepeat = CK_PARAM_NRepeat; +constexpr index_t K1 = CK_PARAM_K1; + +using ABlockTransferThreadSliceLengths_K0_M_K1 = + Sequence; +using ABlockTransferThreadClusterLengths_K0_M_K1 = + Sequence; +using ABlockTransferThreadClusterArrangeOrder = + Sequence; +using ABlockTransferSrcAccessOrder = Sequence; + +constexpr index_t ABlockTransferSrcVectorDim = CK_PARAM_ABlockTransferSrcVectorDim; +constexpr index_t ABlockTransferSrcScalarPerVector = CK_PARAM_ABlockTransferSrcScalarPerVector; +constexpr index_t ABlockTransferDstScalarPerVector_K1 = + CK_PARAM_ABlockTransferDstScalarPerVector_K1; +constexpr bool AThreadTransferSrcResetCoordinateAfterRun = + static_cast(CK_PARAM_AThreadTransferSrcResetCoordinateAfterRun); + +using BBlockTransferThreadSliceLengths_K0_N_K1 = + Sequence; +using BBlockTransferThreadClusterLengths_K0_N_K1 = + Sequence; +using BBlockTransferThreadClusterArrangeOrder = + Sequence; +using BBlockTransferSrcAccessOrder = Sequence; + +constexpr index_t BBlockTransferSrcVectorDim = CK_PARAM_BBlockTransferSrcVectorDim; +constexpr index_t BBlockTransferSrcScalarPerVector = CK_PARAM_BBlockTransferSrcScalarPerVector; +constexpr index_t BBlockTransferDstScalarPerVector_K1 = + CK_PARAM_BBlockTransferDstScalarPerVector_K1; +constexpr bool BThreadTransferSrcResetCoordinateAfterRun = + static_cast(CK_PARAM_BThreadTransferSrcResetCoordinateAfterRun); + +using CThreadTransferSrcDstAccessOrder = Sequence; +constexpr index_t 
CThreadTransferSrcDstVectorDim = CK_PARAM_CThreadTransferSrcDstVectorDim; +constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector; + +extern "C" __global__ void +dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_prepare( + int n, + int c, + int hi, + int wi, + int k, + int y, + int x, + int convStrideH, + int convStrideW, + int convDilationY, + int convDilationX, + int leftPadH, + int leftPadW, + int rightPadH, + int rightPadW, + void* p_a_k0_m_k1_grid_desc, + void* p_b_k0_n_k1_grid_desc, + void* p_c_m0_m1_m2_n_grid_desc, + void* p_c_blockid_to_m0_n0_block_cluster_adaptor) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + + const index_t ho = (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1; + const index_t wo = (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1; + + const auto in_n_c_hi_wi_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, c, hi, wi)); + const auto wei_k_c_y_x_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(k, c, y, x)); + const auto out_n_k_ho_wo_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, k, ho, wo)); + + const auto descs = transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad( + wei_k_c_y_x_desc, + in_n_c_hi_wi_desc, + out_n_k_ho_wo_desc, + make_tuple(convStrideH, convStrideW), + make_tuple(convDilationY, convDilationX), + make_tuple(leftPadH, leftPadW), + make_tuple(rightPadH, rightPadW), + Number{}); + + const auto a_k0_m_k1_grid_desc = descs[I0]; + const auto b_k0_n_k1_grid_desc = descs[I1]; + const auto c_m_n_grid_desc = descs[I2]; + + using AK0MK1GridDesc = decltype(a_k0_m_k1_grid_desc); + using BK0NK1GridDesc = decltype(b_k0_n_k1_grid_desc); + using CMNGridDesc = decltype(c_m_n_grid_desc); + + using AGridIteratorHacks = decltype(make_tuple( + make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}), + make_tuple( + Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}))); + + using BGridIteratorHacks = + decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}))); + + using CGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}))); + + using AGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0>; + using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>; + + using GridwiseGemm = + GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3; + + auto c_m0_m1_m2_n_grid_desc = GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc); + + auto c_blockid_to_m0_n0_block_cluster_adaptor = + GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc); + + if(hipThreadIdx_x == 0) + { + 
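+        // Every thread computes the same descriptors above; only thread 0 stores
+        // them into the caller-provided buffers, and the main kernel below reloads
+        // them through its `const void CONSTANT*` arguments.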
*static_cast*>(p_a_k0_m_k1_grid_desc) = + a_k0_m_k1_grid_desc; + *static_cast*>(p_b_k0_n_k1_grid_desc) = + b_k0_n_k1_grid_desc; + *static_cast(p_c_m0_m1_m2_n_grid_desc) = + c_m0_m1_m2_n_grid_desc; + *static_cast( + p_c_blockid_to_m0_n0_block_cluster_adaptor) = c_blockid_to_m0_n0_block_cluster_adaptor; + } +}; + +extern "C" __global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const void CONSTANT* p_a_k0_m_k1_grid_desc, + const void CONSTANT* p_b_k0_n_k1_grid_desc, + const void CONSTANT* p_c_m0_m1_m2_n_grid_desc, + const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor) +{ + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + + constexpr auto in_n_c_hi_wi_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 28, 28)); + constexpr auto wei_k_c_y_x_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 3, 3)); + constexpr auto out_n_k_ho_wo_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 28, 28)); + + constexpr auto descs = + transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc, + in_n_c_hi_wi_desc, + out_n_k_ho_wo_desc, + make_tuple(1, 1), + make_tuple(1, 1), + make_tuple(1, 1), + make_tuple(1, 1), + Number{}); + + constexpr auto a_k0_m_k1_grid_desc_tmp = descs[I0]; + constexpr auto b_k0_n_k1_grid_desc_tmp = descs[I1]; + constexpr auto c_m_n_grid_desc = descs[I2]; + + using AGridIteratorHacks = decltype(make_tuple( + make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}), + make_tuple( + Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}))); + + using BGridIteratorHacks = + decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}))); + + using CGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}))); + + using AGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0>; + using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>; + + using AK0MK1GridDesc = decltype(a_k0_m_k1_grid_desc_tmp); + using BK0NK1GridDesc = decltype(b_k0_n_k1_grid_desc_tmp); + using CMNGridDesc = decltype(c_m_n_grid_desc); + + using GridwiseGemm = + GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3; + + constexpr auto c_m0_m1_m2_n_grid_desc_tmp = + GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc); + constexpr auto c_blockid_to_m0_n0_block_cluster_adaptor_tmp = + GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc); + + using CM0M1M2NGridDesc = 
decltype(c_m0_m1_m2_n_grid_desc_tmp); + using CBlockIdToM0N0BlockClusterAdaptor = + decltype(c_blockid_to_m0_n0_block_cluster_adaptor_tmp); + + const auto a_k0_m_k1_grid_desc = + *reinterpret_cast((const void*)p_a_k0_m_k1_grid_desc); + const auto b_k0_n_k1_grid_desc = + *reinterpret_cast((const void*)p_b_k0_n_k1_grid_desc); + const auto c_m0_m1_m2_n_grid_desc = + *reinterpret_cast((const void*)p_c_m0_m1_m2_n_grid_desc); + const auto c_blockid_to_m0_n0_block_cluster_adaptor = + *reinterpret_cast( + (const void*)p_c_blockid_to_m0_n0_block_cluster_adaptor); + + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared_block, + a_k0_m_k1_grid_desc, + b_k0_n_k1_grid_desc, + c_m0_m1_m2_n_grid_desc, + c_blockid_to_m0_n0_block_cluster_adaptor); +}; diff --git a/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp b/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp new file mode 100644 index 0000000000..d49693b511 --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp @@ -0,0 +1,362 @@ +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "gridwise_dynamic_gemm_xdlops_v2r3.hpp" +#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp" + +using namespace ck; + +constexpr DataTypeEnum_t ABDataTypeEnum = static_cast(CK_PARAM_ABDataTypeEnum); +constexpr DataTypeEnum_t AccDataTypeEnum = static_cast(CK_PARAM_AccDataTypeEnum); +constexpr DataTypeEnum_t CDataTypeEnum = static_cast(CK_PARAM_CDataTypeEnum); + +using FloatAB = typename get_datatype_from_enum::type; +using FloatAcc = typename get_datatype_from_enum::type; +using FloatC = typename get_datatype_from_enum::type; + +constexpr index_t BlockSize = CK_PARAM_BlockSize; + +constexpr index_t MPerBlock = CK_PARAM_MPerBlock; +constexpr index_t NPerBlock = CK_PARAM_NPerBlock; +constexpr index_t KPerBlock = CK_PARAM_KPerBlock; + +constexpr index_t MPerWave = CK_PARAM_MPerWave; +constexpr index_t NPerWave = CK_PARAM_NPerWave; +constexpr index_t MRepeat = CK_PARAM_MRepeat; +constexpr index_t NRepeat = CK_PARAM_NRepeat; +constexpr index_t K1 = CK_PARAM_K1; + +using ABlockTransferThreadSliceLengths_K0_M_K1 = + Sequence; +using ABlockTransferThreadClusterLengths_K0_M_K1 = + Sequence; +using ABlockTransferThreadClusterArrangeOrder = + Sequence; +using ABlockTransferSrcAccessOrder = Sequence; + +constexpr index_t ABlockTransferSrcVectorDim = CK_PARAM_ABlockTransferSrcVectorDim; +constexpr index_t ABlockTransferSrcScalarPerVector = CK_PARAM_ABlockTransferSrcScalarPerVector; +constexpr index_t ABlockTransferDstScalarPerVector_K1 = + CK_PARAM_ABlockTransferDstScalarPerVector_K1; +constexpr bool AThreadTransferSrcResetCoordinateAfterRun = + static_cast(CK_PARAM_AThreadTransferSrcResetCoordinateAfterRun); + +using BBlockTransferThreadSliceLengths_K0_N_K1 = + Sequence; +using BBlockTransferThreadClusterLengths_K0_N_K1 = + Sequence; +using BBlockTransferThreadClusterArrangeOrder = + Sequence; +using BBlockTransferSrcAccessOrder = Sequence; + +constexpr index_t BBlockTransferSrcVectorDim = CK_PARAM_BBlockTransferSrcVectorDim; +constexpr index_t BBlockTransferSrcScalarPerVector = CK_PARAM_BBlockTransferSrcScalarPerVector; 
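+// All CK_PARAM_* macros in this translation unit are expected to be supplied as
+// compile-time definitions by the online-compilation driver (for example
+// -D CK_PARAM_BlockSize=256 -D CK_PARAM_KPerBlock=8; these values are
+// illustrative only). Each macro is captured as a constexpr value or Sequence
+// alias so that the tuning parameters become template arguments of the
+// GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3 instantiation below.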
+constexpr index_t BBlockTransferDstScalarPerVector_K1 = + CK_PARAM_BBlockTransferDstScalarPerVector_K1; +constexpr bool BThreadTransferSrcResetCoordinateAfterRun = + static_cast(CK_PARAM_BThreadTransferSrcResetCoordinateAfterRun); + +using CThreadTransferSrcDstAccessOrder = Sequence; +constexpr index_t CThreadTransferSrcDstVectorDim = CK_PARAM_CThreadTransferSrcDstVectorDim; +constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector; + +extern "C" __global__ void +dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_prepare( + int n, + int hi, + int wi, + int c, + int k, + int y, + int x, + int convStrideH, + int convStrideW, + int convDilationY, + int convDilationX, + int leftPadH, + int leftPadW, + int rightPadH, + int rightPadW, + void* p_a_k0_m_k1_grid_desc, + void* p_b_k0_n_k1_grid_desc, + void* p_c_m0_m1_m2_n_grid_desc, + void* p_c_blockid_to_m0_n0_block_cluster_adaptor) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + + const index_t ho = (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1; + const index_t wo = (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1; + + const auto in_n_hi_wi_c_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, hi, wi, c)); + const auto wei_k_y_x_c_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(k, y, x, c)); + const auto out_n_ho_wo_k_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, ho, wo, k)); + + const auto descs = transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk_pad( + in_n_hi_wi_c_desc, + wei_k_y_x_c_desc, + out_n_ho_wo_k_desc, + make_tuple(convStrideH, convStrideW), + make_tuple(convDilationY, convDilationX), + make_tuple(leftPadH, leftPadW), + make_tuple(rightPadH, rightPadW), + Number{}); + + const auto a_k0_m_k1_grid_desc = descs[I0]; + const auto b_k0_n_k1_grid_desc = descs[I1]; + const auto c_m_n_grid_desc = descs[I2]; + + using AK0MK1GridDesc = decltype(a_k0_m_k1_grid_desc); + using BK0NK1GridDesc = decltype(b_k0_n_k1_grid_desc); + using CMNGridDesc = decltype(c_m_n_grid_desc); + + using BGridIteratorHacks = decltype(make_tuple( + make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}), + make_tuple( + Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}))); + + using AGridIteratorHacks = + decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}))); + + using CGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}))); + + using AGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>; + using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0>; + + using 
GridwiseGemm = + GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3; + + auto c_m0_m1_m2_n_grid_desc = GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc); + + auto c_blockid_to_m0_n0_block_cluster_adaptor = + GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc); + + if(hipThreadIdx_x == 0) + { + *static_cast*>(p_a_k0_m_k1_grid_desc) = + a_k0_m_k1_grid_desc; + *static_cast*>(p_b_k0_n_k1_grid_desc) = + b_k0_n_k1_grid_desc; + *static_cast(p_c_m0_m1_m2_n_grid_desc) = + c_m0_m1_m2_n_grid_desc; + *static_cast( + p_c_blockid_to_m0_n0_block_cluster_adaptor) = c_blockid_to_m0_n0_block_cluster_adaptor; + } +}; + +extern "C" __global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const void CONSTANT* p_a_k0_m_k1_grid_desc, + const void CONSTANT* p_b_k0_n_k1_grid_desc, + const void CONSTANT* p_c_m0_m1_m2_n_grid_desc, + const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor) +{ + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_n_hi_wi_c_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 28, 28, 256)); + constexpr auto wei_k_y_x_c_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 3, 3, 256)); + constexpr auto out_n_ho_wo_k_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 28, 28, 256)); + + constexpr auto descs = + transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk_pad(in_n_hi_wi_c_desc, + wei_k_y_x_c_desc, + out_n_ho_wo_k_desc, + make_tuple(1, 1), + make_tuple(1, 1), + make_tuple(1, 1), + make_tuple(1, 1), + Number{}); + + constexpr auto a_k0_m_k1_grid_desc_tmp = descs[I0]; + constexpr auto b_k0_n_k1_grid_desc_tmp = descs[I1]; + constexpr auto c_m_n_grid_desc = descs[I2]; + + using AK0MK1GridDesc = decltype(a_k0_m_k1_grid_desc_tmp); + using BK0NK1GridDesc = decltype(b_k0_n_k1_grid_desc_tmp); + using CMNGridDesc = decltype(c_m_n_grid_desc); + + using BGridIteratorHacks = decltype(make_tuple( + make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}), + make_tuple( + Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}))); + + using AGridIteratorHacks = + decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}))); + + using CGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}))); + + using AGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>; + using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0>; 
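+    // Note: the constexpr descriptors built above from the fixed 256 x 28 x 28 x 256
+    // NHWC problem (3 x 3 filter, stride 1, dilation 1, pad 1, so the output stays
+    // 28 x 28: (28 + 1 + 1 - 2 - 1) / 1 + 1 = 28) are used only to derive the
+    // descriptor and adaptor *types*; the descriptor *values* actually used at run
+    // time are the ones computed by the *_prepare kernel and reloaded below from
+    // the CONSTANT pointer arguments.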
+ + using GridwiseGemm = + GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3; + constexpr auto c_m0_m1_m2_n_grid_desc_tmp = + GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc); + constexpr auto c_blockid_to_m0_n0_block_cluster_adaptor_tmp = + GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc); + + using CM0M1M2NGridDesc = decltype(c_m0_m1_m2_n_grid_desc_tmp); + using CBlockIdToM0N0BlockClusterAdaptor = + decltype(c_blockid_to_m0_n0_block_cluster_adaptor_tmp); + + const auto a_k0_m_k1_grid_desc = + *reinterpret_cast((const void*)p_a_k0_m_k1_grid_desc); + const auto b_k0_n_k1_grid_desc = + *reinterpret_cast((const void*)p_b_k0_n_k1_grid_desc); + const auto c_m0_m1_m2_n_grid_desc = + *reinterpret_cast((const void*)p_c_m0_m1_m2_n_grid_desc); + const auto c_blockid_to_m0_n0_block_cluster_adaptor = + *reinterpret_cast( + (const void*)p_c_blockid_to_m0_n0_block_cluster_adaptor); + + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared_block, + a_k0_m_k1_grid_desc, + b_k0_n_k1_grid_desc, + c_m0_m1_m2_n_grid_desc, + c_blockid_to_m0_n0_block_cluster_adaptor); +}; diff --git a/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp b/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp new file mode 100644 index 0000000000..90c957bb0b --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp @@ -0,0 +1,392 @@ +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "gridwise_dynamic_contraction_dlops_v1r2.hpp" +#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp" + +using namespace ck; + +constexpr DataTypeEnum_t ABDataTypeEnum = static_cast(CK_PARAM_ABDataTypeEnum); +constexpr DataTypeEnum_t AccDataTypeEnum = static_cast(CK_PARAM_AccDataTypeEnum); +constexpr DataTypeEnum_t CDataTypeEnum = static_cast(CK_PARAM_CDataTypeEnum); + +using FloatAB = typename get_datatype_from_enum::type; +using FloatAcc = typename get_datatype_from_enum::type; +using FloatC = typename get_datatype_from_enum::type; + +constexpr index_t BlockSize = CK_PARAM_BlockSize; + +constexpr auto GN0 = Number{}; +constexpr auto GK1 = Number{}; + +constexpr index_t GM1PerBlockGM11 = CK_PARAM_GM1PerBlockGM11; +constexpr index_t GN1PerBlockGN11 = CK_PARAM_GN1PerBlockGN11; +constexpr index_t GK0PerBlock = CK_PARAM_GK0PerBlock; + +constexpr index_t BM1PerThreadBM11 = CK_PARAM_BM1PerThreadBM11; +constexpr index_t BN1PerThreadBN11 = CK_PARAM_BN1PerThreadBN11; +constexpr index_t BK0PerThread = CK_PARAM_BK0PerThread; + +using BM10BN10ThreadClusterBM10Xs = Sequence; +using BM10BN10ThreadClusterBN10Xs = Sequence; + +using ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = + Sequence; +using ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = + Sequence; +using ABlockTransferThreadClusterArrangeOrder = Sequence<1, 2, 3, 0, 4>; +using ABlockTransferSrcAccessOrder = Sequence<3, 2, 1, 0, 4>; +using ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = + Sequence; +using ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = + Sequence; +using ABlockTransferSrcVectorTensorContiguousDimOrder = Sequence<0, 1, 2, 3, 4>; + +using 
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = + Sequence; +using BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = + Sequence; +using BBlockTransferThreadClusterArrangeOrder = Sequence<0, 4, 1, 2, 3>; +using BBlockTransferSrcAccessOrder = Sequence<4, 3, 2, 0, 1>; +using BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = + Sequence; +using BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = + Sequence; +using BBlockTransferSrcVectorTensorContiguousDimOrder = Sequence<0, 1, 2, 3, 4>; + +using CThreadTransferSrcDstAccessOrder = Sequence<3, 4, 5, 0, 1, 2>; +constexpr index_t CThreadTransferSrcDstVectorDim = 5; +constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector; + +constexpr bool HasMainKBlockLoop = static_cast(CK_PARAM_HasMainKBlockLoop); +constexpr bool HasDoubleTailKBlockLoop = static_cast(CK_PARAM_HasDoubleTailKBlockLoop); + +extern "C" __global__ void +dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare(index_t N, + index_t C, + index_t Hi, + index_t Wi, + index_t K, + index_t Y, + index_t X, + index_t ConvStrideH, + index_t ConvStrideW, + index_t ConvDilationH, + index_t ConvDilationW, + index_t InLeftPadH, + index_t InLeftPadW, + index_t InRightPadH, + index_t InRightPadW, + void* p_desc_tuple) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + + const index_t Ho = + (Hi + InLeftPadH + InRightPadH - ConvDilationH * (Y - 1) - 1) / ConvStrideH + 1; + const index_t Wo = + (Wi + InLeftPadW + InRightPadW - ConvDilationW * (X - 1) - 1) / ConvStrideW + 1; + + const auto in_n_c_hi_wi_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, C, Hi, Wi)); + const auto wei_k_c_y_x_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C, Y, X)); + const auto out_n_k_ho_wo_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho, Wo)); + + const auto descs = transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad( + wei_k_c_y_x_desc, + in_n_c_hi_wi_desc, + out_n_k_ho_wo_desc, + make_tuple(ConvStrideH, ConvStrideW), + make_tuple(ConvDilationH, ConvDilationW), + make_tuple(InLeftPadH, InLeftPadW), + make_tuple(InRightPadH, InRightPadW), + GN0, + GK1); + + const auto a_grid_desc_gk0_gm0_gm1_gk1 = descs[I0]; + const auto b_grid_desc_gk0_gn0_gn1_gk1 = descs[I1]; + const auto c_grid_desc_gm0_gm1_gn0_gn1 = descs[I2]; + + using AGridDesc_GK0_GM0_GM1_GK1 = decltype(a_grid_desc_gk0_gm0_gm1_gk1); + using BGridDesc_GK0_GN0_GN1_GK1 = decltype(b_grid_desc_gk0_gn0_gn1_gk1); + using CGridDesc_GM0_GM1_GN0_GN1 = decltype(c_grid_desc_gm0_gm1_gn0_gn1); + + using AGridIteratorHacks = + decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 0+: GK0 + Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 1+: GM0 + Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 2+: GM10 + Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 3+: GM11 + Sequence<0, 0, 0, 0, 0, 0, 0>{}), // 4+: GK1 + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 0-: GK0 + Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 1-: GM0 + Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 2-: GM10 + Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 3-: GM11 + Sequence<0, 0, 0, 0, 0, 0, 0>{}))); // 4-: GK1 + + using BGridIteratorHacks = decltype(make_tuple( + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GK0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 1+: GN0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 2+: GN10 + Sequence<0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 3+: GN11 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 4+: GK1 + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GK0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 1-: GN0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 2-: GN10 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 3-: GN11 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}))); // 4-: GK1 + + using CGridIteratorHacks = decltype(make_tuple( + make_tuple( + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GM10 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 1+: BM0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 2+: BM1 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3+: GN10 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}, // 4+: BN0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}), // 5+: GN1 + make_tuple( + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GM10 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 1-: BM0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 2-: BM1 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3-: GN10 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}, // 4-: BN0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}))); // 5-: GN1 + + using AGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0>; + + using BGridMoveSliceWindowIteratorHacks = + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0>; + + using GridwiseContraction = + GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1< + BlockSize, + FloatAB, + FloatAcc, + FloatC, + InMemoryDataOperationEnum_t::Set, + AGridDesc_GK0_GM0_GM1_GK1, + BGridDesc_GK0_GN0_GN1_GK1, + CGridDesc_GM0_GM1_GN0_GN1, + GM1PerBlockGM11, + GN1PerBlockGN11, + GK0PerBlock, + BM1PerThreadBM11, + BN1PerThreadBN11, + BK0PerThread, + BM10BN10ThreadClusterBM10Xs, + BM10BN10ThreadClusterBN10Xs, + ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1, + ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1, + ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1, + ABlockTransferSrcVectorTensorContiguousDimOrder, + BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1, + BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1, + BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1, + BBlockTransferSrcVectorTensorContiguousDimOrder, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + AGridIteratorHacks, + BGridIteratorHacks, + CGridIteratorHacks, + AGridMoveSliceWindowIteratorHacks, + BGridMoveSliceWindowIteratorHacks>; + + if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0) + { + auto desc_tuple = + make_tuple(GridwiseContraction::MakeAGridDescriptor_GK0_GM0_GM10_GM11_GK1( + a_grid_desc_gk0_gm0_gm1_gk1), + GridwiseContraction::MakeBGridDescriptor_GK0_GN0_GN10_GN11_GK1( + 
b_grid_desc_gk0_gn0_gn1_gk1), + GridwiseContraction::MakeCGridDescriptor_GM10_BM0_BM1_GN10_BN0_BN1( + c_grid_desc_gm0_gm1_gn0_gn1), + GridwiseContraction::MakeCGridBlockCluster_BlockId_To_GM10_GN10( + c_grid_desc_gm0_gm1_gn0_gn1)); + + *static_cast(p_desc_tuple) = desc_tuple; + } +}; + +extern "C" __global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const void CONSTANT* p_desc_tuple) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_n_c_hi_wi_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 28, 28)); + constexpr auto wei_k_c_y_x_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 3, 3)); + constexpr auto out_n_k_ho_wo_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(256, 256, 28, 28)); + + constexpr auto descs = + transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc, + in_n_c_hi_wi_desc, + out_n_k_ho_wo_desc, + make_tuple(1, 1), + make_tuple(1, 1), + make_tuple(1, 1), + make_tuple(1, 1), + GN0, + GK1); + + constexpr auto a_grid_desc_gk0_gm0_gm1_gk1 = descs[I0]; + constexpr auto b_grid_desc_gk0_gn0_gn1_gk1 = descs[I1]; + constexpr auto c_grid_desc_gm0_gm1_gn0_gn1 = descs[I2]; + + using AGridDesc_GK0_GM0_GM1_GK1 = decltype(a_grid_desc_gk0_gm0_gm1_gk1); + using BGridDesc_GK0_GN0_GN1_GK1 = decltype(b_grid_desc_gk0_gn0_gn1_gk1); + using CGridDesc_GM0_GM1_GN0_GN1 = decltype(c_grid_desc_gm0_gm1_gn0_gn1); + + using AGridIteratorHacks = + decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 0+: GK0 + Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 1+: GM0 + Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 2+: GM10 + Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 3+: GM11 + Sequence<0, 0, 0, 0, 0, 0, 0>{}), // 4+: GK1 + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 0-: GK0 + Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 1-: GM0 + Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 2-: GM10 + Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 3-: GM11 + Sequence<0, 0, 0, 0, 0, 0, 0>{}))); // 4-: GK1 + + using BGridIteratorHacks = decltype(make_tuple( + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GK0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 1+: GN0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 2+: GN10 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 3+: GN11 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 4+: GK1 + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GK0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 1-: GN0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 2-: GN10 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 3-: GN11 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}))); // 4-: GK1 + + using CGridIteratorHacks = decltype(make_tuple( + make_tuple( + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GM10 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 1+: BM0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 2+: BM1 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0>{}, // 3+: GN10 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}, // 4+: BN0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}), // 5+: GN1 + make_tuple( + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GM10 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 1-: BM0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 2-: BM1 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3-: GN10 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}, // 4-: BN0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}))); // 5-: GN1 + + using AGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0>; + + using BGridMoveSliceWindowIteratorHacks = + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0>; + + using GridwiseContraction = + GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1< + BlockSize, + FloatAB, + FloatAcc, + FloatC, + InMemoryDataOperationEnum_t::Set, + AGridDesc_GK0_GM0_GM1_GK1, + BGridDesc_GK0_GN0_GN1_GK1, + CGridDesc_GM0_GM1_GN0_GN1, + GM1PerBlockGM11, + GN1PerBlockGN11, + GK0PerBlock, + BM1PerThreadBM11, + BN1PerThreadBN11, + BK0PerThread, + BM10BN10ThreadClusterBM10Xs, + BM10BN10ThreadClusterBN10Xs, + ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1, + ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1, + ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1, + ABlockTransferSrcVectorTensorContiguousDimOrder, + BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1, + BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1, + BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1, + BBlockTransferSrcVectorTensorContiguousDimOrder, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + AGridIteratorHacks, + BGridIteratorHacks, + CGridIteratorHacks, + AGridMoveSliceWindowIteratorHacks, + BGridMoveSliceWindowIteratorHacks>; + + using AGridDesc_GK0_GM0_GM10_GM11_GK1 = + decltype(GridwiseContraction::MakeAGridDescriptor_GK0_GM0_GM10_GM11_GK1( + a_grid_desc_gk0_gm0_gm1_gk1)); + using BGridDesc_GK0_GN0_GN10_GN11_GK1 = + decltype(GridwiseContraction::MakeBGridDescriptor_GK0_GN0_GN10_GN11_GK1( + b_grid_desc_gk0_gn0_gn1_gk1)); + using CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1 = + decltype(GridwiseContraction::MakeCGridDescriptor_GM10_BM0_BM1_GN10_BN0_BN1( + c_grid_desc_gm0_gm1_gn0_gn1)); + using CGridBlockCluster_BlockId_To_GM10_GN10 = + decltype(GridwiseContraction::MakeCGridBlockCluster_BlockId_To_GM10_GN10( + c_grid_desc_gm0_gm1_gn0_gn1)); + + using DescTuple = decltype(make_tuple(AGridDesc_GK0_GM0_GM10_GM11_GK1{}, + BGridDesc_GK0_GN0_GN10_GN11_GK1{}, + CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1{}, + CGridBlockCluster_BlockId_To_GM10_GN10{})); + + const auto desc_tuple = *reinterpret_cast( +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wold-style-cast" + // TODO: how to cast? 
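        // The tuple of transformed tensor descriptors was written to device memory by the
        // *_prepare kernel above; p_desc_tuple is passed through the CONSTANT address space,
        // so it is first stripped to a generic const void* (old-style cast, hence the
        // suppressed warning) and then reinterpreted as a pointer to the statically typed
        // DescTuple before being dereferenced.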
+ (const void*)p_desc_tuple +#pragma clang diagnostic pop + ); + + const auto a_grid_desc_gk0_gm0_gm10_gm11_gk1 = desc_tuple[I0]; + const auto b_grid_desc_gk0_gn0_gn10_gn11_gk1 = desc_tuple[I1]; + const auto c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1 = desc_tuple[I2]; + const auto c_grid_block_cluster_blockid_to_gm10_gn10 = desc_tuple[I3]; + + constexpr index_t shared_block_size = + GridwiseContraction::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseContraction::Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared_block, + a_grid_desc_gk0_gm0_gm10_gm11_gk1, + b_grid_desc_gk0_gn0_gn10_gn11_gk1, + c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1, + c_grid_block_cluster_blockid_to_gm10_gn10, + integral_constant{}, + integral_constant{}); +}; diff --git a/external/half/include/half.hpp b/external/half/include/half.hpp new file mode 100644 index 0000000000..b698aac39f --- /dev/null +++ b/external/half/include/half.hpp @@ -0,0 +1,5671 @@ +// half - IEEE 754-based half-precision floating-point library. +// +// Copyright (c) 2012-2019 Christian Rau +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation +// the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +// persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +// SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +// CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// Version 2.1.0 + +/// \file +/// Main header file for half-precision functionality. 
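// A minimal sketch (illustrative only; the names below are not part of half.hpp, and a
// C++11 compiler is assumed): the bit manipulations throughout this header rely on the
// IEEE 754 binary16 layout -- 1 sign bit, 5 exponent bits with bias 15, 10 mantissa bits.
namespace half_layout_sketch {
constexpr unsigned sign_bit(unsigned h) { return (h >> 15) & 0x1; }  // 1 bit
constexpr unsigned exponent(unsigned h) { return (h >> 10) & 0x1F; } // 5 bits, bias 15
constexpr unsigned mantissa(unsigned h) { return h & 0x3FF; }        // 10 bits
// 0x3C00 encodes 1.0 (exponent 15, empty mantissa); 0x7C00 encodes +infinity; any value
// whose low 15 bits exceed 0x7C00 is a NaN, and bit 0x200 marks it as quiet.
static_assert(exponent(0x3C00) == 15 && mantissa(0x3C00) == 0, "0x3C00 is 1.0");
static_assert(exponent(0x7C00) == 31 && mantissa(0x7C00) == 0, "0x7C00 is +inf");
static_assert((0x7E00 & 0x7FFF) > 0x7C00 && (0x7E00 & 0x200) != 0, "0x7E00 is a quiet NaN");
} // namespace half_layout_sketch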
+ +#ifndef HALF_HALF_HPP +#define HALF_HALF_HPP + +#define HALF_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#if defined(__INTEL_COMPILER) +#define HALF_ICC_VERSION __INTEL_COMPILER +#elif defined(__ICC) +#define HALF_ICC_VERSION __ICC +#elif defined(__ICL) +#define HALF_ICC_VERSION __ICL +#else +#define HALF_ICC_VERSION 0 +#endif + +// check C++11 language features +#if defined(__clang__) // clang +#if __has_feature(cxx_static_assert) && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) +#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 +#endif +#if __has_feature(cxx_constexpr) && !defined(HALF_ENABLE_CPP11_CONSTEXPR) +#define HALF_ENABLE_CPP11_CONSTEXPR 1 +#endif +#if __has_feature(cxx_noexcept) && !defined(HALF_ENABLE_CPP11_NOEXCEPT) +#define HALF_ENABLE_CPP11_NOEXCEPT 1 +#endif +#if __has_feature(cxx_user_literals) && !defined(HALF_ENABLE_CPP11_USER_LITERALS) +#define HALF_ENABLE_CPP11_USER_LITERALS 1 +#endif +#if __has_feature(cxx_thread_local) && !defined(HALF_ENABLE_CPP11_THREAD_LOCAL) +#define HALF_ENABLE_CPP11_THREAD_LOCAL 1 +#endif +#if(defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L) && \ + !defined(HALF_ENABLE_CPP11_LONG_LONG) +#define HALF_ENABLE_CPP11_LONG_LONG 1 +#endif +#elif HALF_ICC_VERSION && defined(__INTEL_CXX11_MODE__) // Intel C++ +#if HALF_ICC_VERSION >= 1500 && !defined(HALF_ENABLE_CPP11_THREAD_LOCAL) +#define HALF_ENABLE_CPP11_THREAD_LOCAL 1 +#endif +#if HALF_ICC_VERSION >= 1500 && !defined(HALF_ENABLE_CPP11_USER_LITERALS) +#define HALF_ENABLE_CPP11_USER_LITERALS 1 +#endif +#if HALF_ICC_VERSION >= 1400 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) +#define HALF_ENABLE_CPP11_CONSTEXPR 1 +#endif +#if HALF_ICC_VERSION >= 1400 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) +#define HALF_ENABLE_CPP11_NOEXCEPT 1 +#endif +#if HALF_ICC_VERSION >= 1110 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) +#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 +#endif +#if HALF_ICC_VERSION >= 1110 && !defined(HALF_ENABLE_CPP11_LONG_LONG) +#define HALF_ENABLE_CPP11_LONG_LONG 1 +#endif +#elif defined(__GNUC__) // gcc +#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L +#if HALF_GCC_VERSION >= 408 && !defined(HALF_ENABLE_CPP11_THREAD_LOCAL) +#define HALF_ENABLE_CPP11_THREAD_LOCAL 1 +#endif +#if HALF_GCC_VERSION >= 407 && !defined(HALF_ENABLE_CPP11_USER_LITERALS) +#define HALF_ENABLE_CPP11_USER_LITERALS 1 +#endif +#if HALF_GCC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) +#define HALF_ENABLE_CPP11_CONSTEXPR 1 +#endif +#if HALF_GCC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) +#define HALF_ENABLE_CPP11_NOEXCEPT 1 +#endif +#if HALF_GCC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) +#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 +#endif +#if !defined(HALF_ENABLE_CPP11_LONG_LONG) +#define HALF_ENABLE_CPP11_LONG_LONG 1 +#endif +#endif +#define HALF_TWOS_COMPLEMENT_INT 1 +#elif defined(_MSC_VER) // Visual C++ +#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_THREAD_LOCAL) +#define HALF_ENABLE_CPP11_THREAD_LOCAL 1 +#endif +#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_USER_LITERALS) +#define HALF_ENABLE_CPP11_USER_LITERALS 1 +#endif +#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) +#define HALF_ENABLE_CPP11_CONSTEXPR 1 +#endif +#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) +#define HALF_ENABLE_CPP11_NOEXCEPT 1 +#endif +#if _MSC_VER >= 1600 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) +#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 +#endif +#if _MSC_VER >= 1310 && !defined(HALF_ENABLE_CPP11_LONG_LONG) +#define HALF_ENABLE_CPP11_LONG_LONG 1 
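// Note (illustrative): every HALF_ENABLE_CPP11_* switch in this detection block is guarded
// by !defined(...), so it can be predefined before including the header to override the
// auto-detection, e.g. for a compiler that is not recognized here:
//   #define HALF_ENABLE_CPP11_CONSTEXPR 1
//   #define HALF_ENABLE_CPP11_NOEXCEPT 1
//   #include "half.hpp"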
+#endif +#define HALF_TWOS_COMPLEMENT_INT 1 +#define HALF_POP_WARNINGS 1 +#pragma warning(push) +#pragma warning(disable : 4099 4127 4146) // struct vs class, constant in if, negative unsigned +#endif + +// check C++11 library features +#include +#if defined(_LIBCPP_VERSION) // libc++ +#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103 +#ifndef HALF_ENABLE_CPP11_TYPE_TRAITS +#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 +#endif +#ifndef HALF_ENABLE_CPP11_CSTDINT +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#ifndef HALF_ENABLE_CPP11_CMATH +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#ifndef HALF_ENABLE_CPP11_HASH +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#ifndef HALF_ENABLE_CPP11_CFENV +#define HALF_ENABLE_CPP11_CFENV 1 +#endif +#endif +#elif defined(__GLIBCXX__) // libstdc++ +#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103 +#ifdef __clang__ +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_TYPE_TRAITS) +#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 +#endif +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CSTDINT) +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CMATH) +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_HASH) +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CFENV) +#define HALF_ENABLE_CPP11_CFENV 1 +#endif +#else +#if HALF_GCC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_TYPE_TRAITS) +#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 +#endif +#if HALF_GCC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CSTDINT) +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#if HALF_GCC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CMATH) +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#if HALF_GCC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_HASH) +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#if HALF_GCC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CFENV) +#define HALF_ENABLE_CPP11_CFENV 1 +#endif +#endif +#endif +#elif defined(_CPPLIB_VER) // Dinkumware/Visual C++ +#if _CPPLIB_VER >= 520 && !defined(HALF_ENABLE_CPP11_TYPE_TRAITS) +#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 +#endif +#if _CPPLIB_VER >= 520 && !defined(HALF_ENABLE_CPP11_CSTDINT) +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#if _CPPLIB_VER >= 520 && !defined(HALF_ENABLE_CPP11_HASH) +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#if _CPPLIB_VER >= 610 && !defined(HALF_ENABLE_CPP11_CMATH) +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#if _CPPLIB_VER >= 610 && !defined(HALF_ENABLE_CPP11_CFENV) +#define HALF_ENABLE_CPP11_CFENV 1 +#endif +#endif +#undef HALF_GCC_VERSION +#undef HALF_ICC_VERSION + +// any error throwing C++ exceptions? +#if defined(HALF_ERRHANDLING_THROW_INVALID) || defined(HALF_ERRHANDLING_THROW_DIVBYZERO) || \ + defined(HALF_ERRHANDLING_THROW_OVERFLOW) || defined(HALF_ERRHANDLING_THROW_UNDERFLOW) || \ + defined(HALF_ERRHANDLING_THROW_INEXACT) +#define HALF_ERRHANDLING_THROWS 1 +#endif + +// any error handling enabled? 
+#define HALF_ERRHANDLING \ + (HALF_ERRHANDLING_FLAGS || HALF_ERRHANDLING_ERRNO || HALF_ERRHANDLING_FENV || \ + HALF_ERRHANDLING_THROWS) + +#if HALF_ERRHANDLING +#define HALF_UNUSED_NOERR(name) name +#else +#define HALF_UNUSED_NOERR(name) +#endif + +// support constexpr +#if HALF_ENABLE_CPP11_CONSTEXPR +#define HALF_CONSTEXPR constexpr +#define HALF_CONSTEXPR_CONST constexpr +#if HALF_ERRHANDLING +#define HALF_CONSTEXPR_NOERR +#else +#define HALF_CONSTEXPR_NOERR constexpr +#endif +#else +#define HALF_CONSTEXPR +#define HALF_CONSTEXPR_CONST const +#define HALF_CONSTEXPR_NOERR +#endif + +// support noexcept +#if HALF_ENABLE_CPP11_NOEXCEPT +#define HALF_NOEXCEPT noexcept +#define HALF_NOTHROW noexcept +#else +#define HALF_NOEXCEPT +#define HALF_NOTHROW throw() +#endif + +// support thread storage +#if HALF_ENABLE_CPP11_THREAD_LOCAL +#define HALF_THREAD_LOCAL thread_local +#else +#define HALF_THREAD_LOCAL static +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if HALF_ENABLE_CPP11_TYPE_TRAITS +#include +#endif +#if HALF_ENABLE_CPP11_CSTDINT +#include +#endif +#if HALF_ERRHANDLING_ERRNO +#include +#endif +#if HALF_ENABLE_CPP11_CFENV +#include +#endif +#if HALF_ENABLE_CPP11_HASH +#include +#endif +#if HALF_ENABLE_F16C_INTRINSICS +#include +#endif + +#ifndef HALF_ENABLE_F16C_INTRINSICS +/// Enable F16C intruction set intrinsics. +/// Defining this to 1 enables the use of [F16C compiler +/// intrinsics](https://en.wikipedia.org/wiki/F16C) for converting between +/// half-precision and single-precision values which may result in improved performance. This will +/// not perform additional checks +/// for support of the F16C instruction set, so an appropriate target platform is required when +/// enabling this feature. +/// +/// Unless predefined it will be enabled automatically when the `__F16C__` symbol is defined, which +/// some compilers do on supporting platforms. +#define HALF_ENABLE_F16C_INTRINSICS __F16C__ +#endif + +#ifdef HALF_DOXYGEN_ONLY +/// Type for internal floating-point computations. +/// This can be predefined to a built-in floating-point type (`float`, `double` or `long double`) to +/// override the internal +/// half-precision implementation to use this type for computing arithmetic operations and +/// mathematical function (if available). +/// This can result in improved performance for arithmetic operators and mathematical functions but +/// might cause results to +/// deviate from the specified half-precision rounding mode and inhibits proper detection of +/// half-precision exceptions. +#define HALF_ARITHMETIC_TYPE (undefined) + +/// Enable internal exception flags. +/// Defining this to 1 causes operations on half-precision values to raise internal floating-point +/// exception flags according to +/// the IEEE 754 standard. These can then be cleared and checked with clearexcept(), testexcept(). +#define HALF_ERRHANDLING_FLAGS 0 + +/// Enable exception propagation to `errno`. +/// Defining this to 1 causes operations on half-precision values to propagate floating-point +/// exceptions to +/// [errno](https://en.cppreference.com/w/cpp/error/errno) from ``. Specifically this will +/// propagate domain errors as +/// [EDOM](https://en.cppreference.com/w/cpp/error/errno_macros) and pole, overflow and underflow +/// errors as +/// [ERANGE](https://en.cppreference.com/w/cpp/error/errno_macros). Inexact errors won't be +/// propagated. 
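// Usage sketch (illustrative, mirroring the surrounding documentation): error handling is
// opt-in and is enabled by predefining one or more of these switches before the #include:
//   #define HALF_ERRHANDLING_FLAGS 1                        // record FE_* flags internally
//   #define HALF_ERRHANDLING_THROW_OVERFLOW "half overflow" // throw std::overflow_error on overflow
//   #include "half.hpp"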
+#define HALF_ERRHANDLING_ERRNO 0 + +/// Enable exception propagation to built-in floating-point platform. +/// Defining this to 1 causes operations on half-precision values to propagate floating-point +/// exceptions to the built-in +/// single- and double-precision implementation's exception flags using the +/// [C++11 floating-point environment control](https://en.cppreference.com/w/cpp/numeric/fenv) from +/// ``. However, this +/// does not work in reverse and single- or double-precision exceptions will not raise the +/// corresponding half-precision +/// exception flags, nor will explicitly clearing flags clear the corresponding built-in flags. +#define HALF_ERRHANDLING_FENV 0 + +/// Throw C++ exception on domain errors. +/// Defining this to a string literal causes operations on half-precision values to throw a +/// [std::domain_error](https://en.cppreference.com/w/cpp/error/domain_error) with the specified +/// message on domain errors. +#define HALF_ERRHANDLING_THROW_INVALID (undefined) + +/// Throw C++ exception on pole errors. +/// Defining this to a string literal causes operations on half-precision values to throw a +/// [std::domain_error](https://en.cppreference.com/w/cpp/error/domain_error) with the specified +/// message on pole errors. +#define HALF_ERRHANDLING_THROW_DIVBYZERO (undefined) + +/// Throw C++ exception on overflow errors. +/// Defining this to a string literal causes operations on half-precision values to throw a +/// [std::overflow_error](https://en.cppreference.com/w/cpp/error/overflow_error) with the specified +/// message on overflows. +#define HALF_ERRHANDLING_THROW_OVERFLOW (undefined) + +/// Throw C++ exception on underflow errors. +/// Defining this to a string literal causes operations on half-precision values to throw a +/// [std::underflow_error](https://en.cppreference.com/w/cpp/error/underflow_error) with the +/// specified message on underflows. +#define HALF_ERRHANDLING_THROW_UNDERFLOW (undefined) + +/// Throw C++ exception on rounding errors. +/// Defining this to 1 causes operations on half-precision values to throw a +/// [std::range_error](https://en.cppreference.com/w/cpp/error/range_error) with the specified +/// message on general rounding errors. +#define HALF_ERRHANDLING_THROW_INEXACT (undefined) +#endif + +#ifndef HALF_ERRHANDLING_OVERFLOW_TO_INEXACT +/// Raise INEXACT exception on overflow. +/// Defining this to 1 (default) causes overflow errors to automatically raise inexact exceptions in +/// addition. +/// These will be raised after any possible handling of the underflow exception. +#define HALF_ERRHANDLING_OVERFLOW_TO_INEXACT 1 +#endif + +#ifndef HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT +/// Raise INEXACT exception on underflow. +/// Defining this to 1 (default) causes underflow errors to automatically raise inexact exceptions +/// in addition. +/// These will be raised after any possible handling of the underflow exception. +/// +/// **Note:** This will actually cause underflow (and the accompanying inexact) exceptions to be +/// raised *only* when the result +/// is inexact, while if disabled bare underflow errors will be raised for *any* (possibly exact) +/// subnormal result. +#define HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT 1 +#endif + +/// Default rounding mode. 
+/// This specifies the rounding mode used for all conversions between [half](\ref half_float::half)s +/// and more precise types +/// (unless using half_cast() and specifying the rounding mode directly) as well as in arithmetic +/// operations and mathematical +/// functions. It can be redefined (before including half.hpp) to one of the standard rounding modes +/// using their respective +/// constants or the equivalent values of +/// [std::float_round_style](https://en.cppreference.com/w/cpp/types/numeric_limits/float_round_style): +/// +/// `std::float_round_style` | value | rounding +/// ---------------------------------|-------|------------------------- +/// `std::round_indeterminate` | -1 | fastest +/// `std::round_toward_zero` | 0 | toward zero +/// `std::round_to_nearest` | 1 | to nearest (default) +/// `std::round_toward_infinity` | 2 | toward positive infinity +/// `std::round_toward_neg_infinity` | 3 | toward negative infinity +/// +/// By default this is set to `1` (`std::round_to_nearest`), which rounds results to the nearest +/// representable value. It can even +/// be set to +/// [std::numeric_limits::round_style](https://en.cppreference.com/w/cpp/types/numeric_limits/round_style) +/// to synchronize +/// the rounding mode with that of the built-in single-precision implementation (which is likely +/// `std::round_to_nearest`, though). +#ifndef HALF_ROUND_STYLE +#define HALF_ROUND_STYLE 1 // = std::round_to_nearest +#endif + +/// Value signaling overflow. +/// In correspondence with `HUGE_VAL[F|L]` from `` this symbol expands to a positive value +/// signaling the overflow of an +/// operation, in particular it just evaluates to positive infinity. +/// +/// **See also:** Documentation for +/// [HUGE_VAL](https://en.cppreference.com/w/cpp/numeric/math/HUGE_VAL) +#define HUGE_VALH std::numeric_limits::infinity() + +/// Fast half-precision fma function. +/// This symbol is defined if the fma() function generally executes as fast as, or faster than, a +/// separate +/// half-precision multiplication followed by an addition, which is always the case. +/// +/// **See also:** Documentation for +/// [FP_FAST_FMA](https://en.cppreference.com/w/cpp/numeric/math/fma) +#define FP_FAST_FMAH 1 + +/// Half rounding mode. +/// In correspondence with `FLT_ROUNDS` from `` this symbol expands to the rounding mode +/// used for +/// half-precision operations. It is an alias for [HALF_ROUND_STYLE](\ref HALF_ROUND_STYLE). +/// +/// **See also:** Documentation for +/// [FLT_ROUNDS](https://en.cppreference.com/w/cpp/types/climits/FLT_ROUNDS) +#define HLF_ROUNDS HALF_ROUND_STYLE + +#ifndef FP_ILOGB0 +#define FP_ILOGB0 INT_MIN +#endif +#ifndef FP_ILOGBNAN +#define FP_ILOGBNAN INT_MAX +#endif +#ifndef FP_SUBNORMAL +#define FP_SUBNORMAL 0 +#endif +#ifndef FP_ZERO +#define FP_ZERO 1 +#endif +#ifndef FP_NAN +#define FP_NAN 2 +#endif +#ifndef FP_INFINITE +#define FP_INFINITE 3 +#endif +#ifndef FP_NORMAL +#define FP_NORMAL 4 +#endif + +#if !HALF_ENABLE_CPP11_CFENV && !defined(FE_ALL_EXCEPT) +#define FE_INVALID 0x10 +#define FE_DIVBYZERO 0x08 +#define FE_OVERFLOW 0x04 +#define FE_UNDERFLOW 0x02 +#define FE_INEXACT 0x01 +#define FE_ALL_EXCEPT (FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW | FE_INEXACT) +#endif + +/// Main namespace for half-precision functionality. +/// This namespace contains all the functionality provided by the library. +namespace half_float { +class half; + +#if HALF_ENABLE_CPP11_USER_LITERALS +/// Library-defined half-precision literals. 
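// Usage sketch (illustrative): the default rounding mode can be changed by predefining
// HALF_ROUND_STYLE before the #include, using the values from the std::float_round_style
// table in its documentation above:
//   #define HALF_ROUND_STYLE 0   // round toward zero instead of to nearest
//   #include "half.hpp"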
+/// Import this namespace to enable half-precision floating-point literals: +/// ~~~~{.cpp} +/// using namespace half_float::literal; +/// half_float::half = 4.2_h; +/// ~~~~ +namespace literal { +half operator"" _h(long double); +} +#endif + +/// \internal +/// \brief Implementation details. +namespace detail { +#if HALF_ENABLE_CPP11_TYPE_TRAITS +/// Conditional type. +template +struct conditional : std::conditional +{ +}; + +/// Helper for tag dispatching. +template +struct bool_type : std::integral_constant +{ +}; +using std::false_type; +using std::true_type; + +/// Type traits for floating-point types. +template +struct is_float : std::is_floating_point +{ +}; +#else +/// Conditional type. +template +struct conditional +{ + typedef T type; +}; +template +struct conditional +{ + typedef F type; +}; + +/// Helper for tag dispatching. +template +struct bool_type +{ +}; +typedef bool_type true_type; +typedef bool_type false_type; + +/// Type traits for floating-point types. +template +struct is_float : false_type +{ +}; +template +struct is_float : is_float +{ +}; +template +struct is_float : is_float +{ +}; +template +struct is_float : is_float +{ +}; +template <> +struct is_float : true_type +{ +}; +template <> +struct is_float : true_type +{ +}; +template <> +struct is_float : true_type +{ +}; +#endif + +/// Type traits for floating-point bits. +template +struct bits +{ + typedef unsigned char type; +}; +template +struct bits : bits +{ +}; +template +struct bits : bits +{ +}; +template +struct bits : bits +{ +}; + +#if HALF_ENABLE_CPP11_CSTDINT +/// Unsigned integer of (at least) 16 bits width. +typedef std::uint_least16_t uint16; + +/// Fastest unsigned integer of (at least) 32 bits width. +typedef std::uint_fast32_t uint32; + +/// Fastest signed integer of (at least) 32 bits width. +typedef std::int_fast32_t int32; + +/// Unsigned integer of (at least) 32 bits width. +template <> +struct bits +{ + typedef std::uint_least32_t type; +}; + +/// Unsigned integer of (at least) 64 bits width. +template <> +struct bits +{ + typedef std::uint_least64_t type; +}; +#else +/// Unsigned integer of (at least) 16 bits width. +typedef unsigned short uint16; + +/// Fastest unsigned integer of (at least) 32 bits width. +typedef unsigned long uint32; + +/// Fastest unsigned integer of (at least) 32 bits width. +typedef long int32; + +/// Unsigned integer of (at least) 32 bits width. +template <> +struct bits + : conditional::digits >= 32, unsigned int, unsigned long> +{ +}; + +#if HALF_ENABLE_CPP11_LONG_LONG +/// Unsigned integer of (at least) 64 bits width. +template <> +struct bits : conditional::digits >= 64, + unsigned long, + unsigned long long> +{ +}; +#else +/// Unsigned integer of (at least) 64 bits width. +template <> +struct bits +{ + typedef unsigned long type; +}; +#endif +#endif + +#ifdef HALF_ARITHMETIC_TYPE +/// Type to use for arithmetic computations and mathematic functions internally. +typedef HALF_ARITHMETIC_TYPE internal_t; +#endif + +/// Tag type for binary construction. +struct binary_t +{ +}; + +/// Tag for binary construction. +HALF_CONSTEXPR_CONST binary_t binary = binary_t(); + +/// \name Implementation defined classification and arithmetic +/// \{ + +/// Check for infinity. 
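// Note (illustrative): the bits<T> trait above maps a built-in floating-point type to an
// unsigned integer type wide enough to hold its bit pattern (at least 32 bits for float,
// 64 bits for double). The conversion routines further down memcpy a float or double into
// that integer type and operate directly on the raw IEEE 754 representation.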
+/// \tparam T argument type (builtin floating-point type) +/// \param arg value to query +/// \retval true if infinity +/// \retval false else +template +bool builtin_isinf(T arg) +{ +#if HALF_ENABLE_CPP11_CMATH + return std::isinf(arg); +#elif defined(_MSC_VER) + return !::_finite(static_cast(arg)) && !::_isnan(static_cast(arg)); +#else + return arg == std::numeric_limits::infinity() || arg == -std::numeric_limits::infinity(); +#endif +} + +/// Check for NaN. +/// \tparam T argument type (builtin floating-point type) +/// \param arg value to query +/// \retval true if not a number +/// \retval false else +template +bool builtin_isnan(T arg) +{ +#if HALF_ENABLE_CPP11_CMATH + return std::isnan(arg); +#elif defined(_MSC_VER) + return ::_isnan(static_cast(arg)) != 0; +#else + return arg != arg; +#endif +} + +/// Check sign. +/// \tparam T argument type (builtin floating-point type) +/// \param arg value to query +/// \retval true if signbit set +/// \retval false else +template +bool builtin_signbit(T arg) +{ +#if HALF_ENABLE_CPP11_CMATH + return std::signbit(arg); +#else + return arg < T() || (arg == T() && T(1) / arg < T()); +#endif +} + +/// Platform-independent sign mask. +/// \param arg integer value in two's complement +/// \retval -1 if \a arg negative +/// \retval 0 if \a arg positive +inline uint32 sign_mask(uint32 arg) +{ + static const int N = std::numeric_limits::digits - 1; +#if HALF_TWOS_COMPLEMENT_INT + return static_cast(arg) >> N; +#else + return -((arg >> N) & 1); +#endif +} + +/// Platform-independent arithmetic right shift. +/// \param arg integer value in two's complement +/// \param i shift amount (at most 31) +/// \return \a arg right shifted for \a i bits with possible sign extension +inline uint32 arithmetic_shift(uint32 arg, int i) +{ +#if HALF_TWOS_COMPLEMENT_INT + return static_cast(arg) >> i; +#else + return static_cast(arg) / (static_cast(1) << i) - + ((arg >> (std::numeric_limits::digits - 1)) & 1); +#endif +} + +/// \} +/// \name Error handling +/// \{ + +/// Internal exception flags. +/// \return reference to global exception flags +inline int& errflags() +{ + HALF_THREAD_LOCAL int flags = 0; + return flags; +} + +/// Raise floating-point exception. 
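// Worked example (illustrative): sign_mask() yields 0xFFFFFFFF for a negative
// two's-complement input and 0 otherwise, so fixed2half() below can take an absolute value
// without branching: with msign = sign_mask(m), (m ^ msign) - msign flips the bits and adds
// one when m is negative (two's-complement negation) and leaves a non-negative m untouched.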
+/// \param flags exceptions to raise +/// \param cond condition to raise exceptions for +inline void raise(int HALF_UNUSED_NOERR(flags), bool HALF_UNUSED_NOERR(cond) = true) +{ +#if HALF_ERRHANDLING + if(!cond) + return; +#if HALF_ERRHANDLING_FLAGS + errflags() |= flags; +#endif +#if HALF_ERRHANDLING_ERRNO + if(flags & FE_INVALID) + errno = EDOM; + else if(flags & (FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW)) + errno = ERANGE; +#endif +#if HALF_ERRHANDLING_FENV && HALF_ENABLE_CPP11_CFENV + std::feraiseexcept(flags); +#endif +#ifdef HALF_ERRHANDLING_THROW_INVALID + if(flags & FE_INVALID) + throw std::domain_error(HALF_ERRHANDLING_THROW_INVALID); +#endif +#ifdef HALF_ERRHANDLING_THROW_DIVBYZERO + if(flags & FE_DIVBYZERO) + throw std::domain_error(HALF_ERRHANDLING_THROW_DIVBYZERO); +#endif +#ifdef HALF_ERRHANDLING_THROW_OVERFLOW + if(flags & FE_OVERFLOW) + throw std::overflow_error(HALF_ERRHANDLING_THROW_OVERFLOW); +#endif +#ifdef HALF_ERRHANDLING_THROW_UNDERFLOW + if(flags & FE_UNDERFLOW) + throw std::underflow_error(HALF_ERRHANDLING_THROW_UNDERFLOW); +#endif +#ifdef HALF_ERRHANDLING_THROW_INEXACT + if(flags & FE_INEXACT) + throw std::range_error(HALF_ERRHANDLING_THROW_INEXACT); +#endif +#if HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT + if((flags & FE_UNDERFLOW) && !(flags & FE_INEXACT)) + raise(FE_INEXACT); +#endif +#if HALF_ERRHANDLING_OVERFLOW_TO_INEXACT + if((flags & FE_OVERFLOW) && !(flags & FE_INEXACT)) + raise(FE_INEXACT); +#endif +#endif +} + +/// Check and signal for any NaN. +/// \param x first half-precision value to check +/// \param y second half-precision value to check +/// \retval true if either \a x or \a y is NaN +/// \retval false else +/// \exception FE_INVALID if \a x or \a y is NaN +inline HALF_CONSTEXPR_NOERR bool compsignal(unsigned int x, unsigned int y) +{ +#if HALF_ERRHANDLING + raise(FE_INVALID, (x & 0x7FFF) > 0x7C00 || (y & 0x7FFF) > 0x7C00); +#endif + return (x & 0x7FFF) > 0x7C00 || (y & 0x7FFF) > 0x7C00; +} + +/// Signal and silence signaling NaN. +/// \param nan half-precision NaN value +/// \return quiet NaN +/// \exception FE_INVALID if \a nan is signaling NaN +inline HALF_CONSTEXPR_NOERR unsigned int signal(unsigned int nan) +{ +#if HALF_ERRHANDLING + raise(FE_INVALID, !(nan & 0x200)); +#endif + return nan | 0x200; +} + +/// Signal and silence signaling NaNs. +/// \param x first half-precision value to check +/// \param y second half-precision value to check +/// \return quiet NaN +/// \exception FE_INVALID if \a x or \a y is signaling NaN +inline HALF_CONSTEXPR_NOERR unsigned int signal(unsigned int x, unsigned int y) +{ +#if HALF_ERRHANDLING + raise(FE_INVALID, + ((x & 0x7FFF) > 0x7C00 && !(x & 0x200)) || ((y & 0x7FFF) > 0x7C00 && !(y & 0x200))); +#endif + return ((x & 0x7FFF) > 0x7C00) ? (x | 0x200) : (y | 0x200); +} + +/// Signal and silence signaling NaNs. +/// \param x first half-precision value to check +/// \param y second half-precision value to check +/// \param z third half-precision value to check +/// \return quiet NaN +/// \exception FE_INVALID if \a x, \a y or \a z is signaling NaN +inline HALF_CONSTEXPR_NOERR unsigned int signal(unsigned int x, unsigned int y, unsigned int z) +{ +#if HALF_ERRHANDLING + raise(FE_INVALID, + ((x & 0x7FFF) > 0x7C00 && !(x & 0x200)) || ((y & 0x7FFF) > 0x7C00 && !(y & 0x200)) || + ((z & 0x7FFF) > 0x7C00 && !(z & 0x200))); +#endif + return ((x & 0x7FFF) > 0x7C00) ? (x | 0x200) + : ((y & 0x7FFF) > 0x7C00) ? (y | 0x200) : (z | 0x200); +} + +/// Select value or signaling NaN. 
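// Worked example (illustrative): a half-precision NaN has all exponent bits set and a
// non-zero mantissa, i.e. (x & 0x7FFF) > 0x7C00, and bit 0x200 distinguishes quiet from
// signaling payloads. 0x7D00 is therefore a signaling NaN: signal(0x7D00) raises FE_INVALID
// and returns 0x7F00, the same payload with the quiet bit set, while invalid() below
// returns 0x7FFF, the canonical quiet NaN used for domain errors.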
+/// \param x preferred half-precision value +/// \param y ignored half-precision value except for signaling NaN +/// \return \a y if signaling NaN, \a x otherwise +/// \exception FE_INVALID if \a y is signaling NaN +inline HALF_CONSTEXPR_NOERR unsigned int select(unsigned int x, unsigned int HALF_UNUSED_NOERR(y)) +{ +#if HALF_ERRHANDLING + return (((y & 0x7FFF) > 0x7C00) && !(y & 0x200)) ? signal(y) : x; +#else + return x; +#endif +} + +/// Raise domain error and return NaN. +/// return quiet NaN +/// \exception FE_INVALID +inline HALF_CONSTEXPR_NOERR unsigned int invalid() +{ +#if HALF_ERRHANDLING + raise(FE_INVALID); +#endif + return 0x7FFF; +} + +/// Raise pole error and return infinity. +/// \param sign half-precision value with sign bit only +/// \return half-precision infinity with sign of \a sign +/// \exception FE_DIVBYZERO +inline HALF_CONSTEXPR_NOERR unsigned int pole(unsigned int sign = 0) +{ +#if HALF_ERRHANDLING + raise(FE_DIVBYZERO); +#endif + return sign | 0x7C00; +} + +/// Check value for underflow. +/// \param arg non-zero half-precision value to check +/// \return \a arg +/// \exception FE_UNDERFLOW if arg is subnormal +inline HALF_CONSTEXPR_NOERR unsigned int check_underflow(unsigned int arg) +{ +#if HALF_ERRHANDLING && !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT + raise(FE_UNDERFLOW, !(arg & 0x7C00)); +#endif + return arg; +} + +/// \} +/// \name Conversion and rounding +/// \{ + +/// Half-precision overflow. +/// \tparam R rounding mode to use +/// \param sign half-precision value with sign bit only +/// \return rounded overflowing half-precision value +/// \exception FE_OVERFLOW +template +HALF_CONSTEXPR_NOERR unsigned int overflow(unsigned int sign = 0) +{ +#if HALF_ERRHANDLING + raise(FE_OVERFLOW); +#endif + return (R == std::round_toward_infinity) + ? (sign + 0x7C00 - (sign >> 15)) + : (R == std::round_toward_neg_infinity) + ? (sign + 0x7BFF + (sign >> 15)) + : (R == std::round_toward_zero) ? (sign | 0x7BFF) : (sign | 0x7C00); +} + +/// Half-precision underflow. +/// \tparam R rounding mode to use +/// \param sign half-precision value with sign bit only +/// \return rounded underflowing half-precision value +/// \exception FE_UNDERFLOW +template +HALF_CONSTEXPR_NOERR unsigned int underflow(unsigned int sign = 0) +{ +#if HALF_ERRHANDLING + raise(FE_UNDERFLOW); +#endif + return (R == std::round_toward_infinity) + ? (sign + 1 - (sign >> 15)) + : (R == std::round_toward_neg_infinity) ? (sign + (sign >> 15)) : sign; +} + +/// Round half-precision number. +/// \tparam R rounding mode to use +/// \tparam I `true` to always raise INEXACT exception, `false` to raise only for rounded results +/// \param value finite half-precision number to round +/// \param g guard bit (most significant discarded bit) +/// \param s sticky bit (or of all but the most significant discarded bits) +/// \return rounded half-precision value +/// \exception FE_OVERFLOW on overflows +/// \exception FE_UNDERFLOW on underflows +/// \exception FE_INEXACT if value had to be rounded or \a I is `true` +template +HALF_CONSTEXPR_NOERR unsigned int rounded(unsigned int value, int g, int s) +{ +#if HALF_ERRHANDLING + value += (R == std::round_to_nearest) + ? (g & (s | value)) + : (R == std::round_toward_infinity) + ? (~(value >> 15) & (g | s)) + : (R == std::round_toward_neg_infinity) ? 
((value >> 15) & (g | s)) : 0; + if((value & 0x7C00) == 0x7C00) + raise(FE_OVERFLOW); + else if(value & 0x7C00) + raise(FE_INEXACT, I || (g | s) != 0); + else + raise(FE_UNDERFLOW, !(HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT) || I || (g | s) != 0); + return value; +#else + return (R == std::round_to_nearest) + ? (value + (g & (s | value))) + : (R == std::round_toward_infinity) + ? (value + (~(value >> 15) & (g | s))) + : (R == std::round_toward_neg_infinity) ? (value + ((value >> 15) & (g | s))) + : value; +#endif +} + +/// Round half-precision number to nearest integer value. +/// \tparam R rounding mode to use +/// \tparam E `true` for round to even, `false` for round away from zero +/// \tparam I `true` to raise INEXACT exception (if inexact), `false` to never raise it +/// \param value half-precision value to round +/// \return half-precision bits for nearest integral value +/// \exception FE_INVALID for signaling NaN +/// \exception FE_INEXACT if value had to be rounded and \a I is `true` +template +unsigned int integral(unsigned int value) +{ + unsigned int abs = value & 0x7FFF; + if(abs < 0x3C00) + { + raise(FE_INEXACT, I); + return ((R == std::round_to_nearest) + ? (0x3C00 & -static_cast(abs >= (0x3800 + E))) + : (R == std::round_toward_infinity) + ? (0x3C00 & -(~(value >> 15) & (abs != 0))) + : (R == std::round_toward_neg_infinity) + ? (0x3C00 & -static_cast(value > 0x8000)) + : 0) | + (value & 0x8000); + } + if(abs >= 0x6400) + return (abs > 0x7C00) ? signal(value) : value; + unsigned int exp = 25 - (abs >> 10), mask = (1 << exp) - 1; + raise(FE_INEXACT, I && (value & mask)); + return (((R == std::round_to_nearest) + ? ((1 << (exp - 1)) - (~(value >> exp) & E)) + : (R == std::round_toward_infinity) + ? (mask & ((value >> 15) - 1)) + : (R == std::round_toward_neg_infinity) ? (mask & -(value >> 15)) : 0) + + value) & + ~mask; +} + +/// Convert fixed point to half-precision floating-point. +/// \tparam R rounding mode to use +/// \tparam F number of fractional bits (at least 11) +/// \tparam S `true` for signed, `false` for unsigned +/// \tparam N `true` for additional normalization step, `false` if already normalized to 1.F +/// \tparam I `true` to always raise INEXACT exception, `false` to raise only for rounded results +/// \param m mantissa in Q1.F fixed point format +/// \param exp exponent +/// \param sign half-precision value with sign bit only +/// \param s sticky bit (or of all but the most significant already discarded bits) +/// \return value converted to half-precision +/// \exception FE_OVERFLOW on overflows +/// \exception FE_UNDERFLOW on underflows +/// \exception FE_INEXACT if value had to be rounded or \a I is `true` +template +unsigned int fixed2half(uint32 m, int exp = 14, unsigned int sign = 0, int s = 0) +{ + if(S) + { + uint32 msign = sign_mask(m); + m = (m ^ msign) - msign; + sign = msign & 0x8000; + } + if(N) + for(; m < (static_cast(1) << F) && exp; m <<= 1, --exp) + ; + else if(exp < 0) + return rounded(sign + (m >> (F - 10 - exp)), + (m >> (F - 11 - exp)) & 1, + s | ((m & ((static_cast(1) << (F - 11 - exp)) - 1)) != 0)); + return rounded(sign + (exp << 10) + (m >> (F - 10)), + (m >> (F - 11)) & 1, + s | ((m & ((static_cast(1) << (F - 11)) - 1)) != 0)); +} + +/// Convert IEEE single-precision to half-precision. +/// Credit for this goes to [Jeroen van der +/// Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf). 
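// Worked example (illustrative) for rounded() above: g is the guard bit (most significant
// discarded bit) and s the sticky bit (OR of the remaining discarded bits). For
// round-to-nearest the update is value + (g & (s | value)), so the result is bumped by one
// ulp only when g is set and either s is set (strictly above the halfway point) or the LSB
// of value is set (an exact tie rounds to even): e.g.
// rounded<std::round_to_nearest, false>(0x3E01, 1, 0) yields 0x3E02, while
// rounded<std::round_to_nearest, false>(0x3E00, 1, 0) stays at 0x3E00.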
+/// \tparam R rounding mode to use +/// \param value single-precision value to convert +/// \return rounded half-precision value +/// \exception FE_OVERFLOW on overflows +/// \exception FE_UNDERFLOW on underflows +/// \exception FE_INEXACT if value had to be rounded +template +unsigned int float2half_impl(float value, true_type) +{ +#if HALF_ENABLE_F16C_INTRINSICS + return _mm_cvtsi128_si32(_mm_cvtps_ph(_mm_set_ss(value), + (R == std::round_to_nearest) + ? _MM_FROUND_TO_NEAREST_INT + : (R == std::round_toward_zero) + ? _MM_FROUND_TO_ZERO + : (R == std::round_toward_infinity) + ? _MM_FROUND_TO_POS_INF + : (R == std::round_toward_neg_infinity) + ? _MM_FROUND_TO_NEG_INF + : _MM_FROUND_CUR_DIRECTION)); +#else + bits::type fbits; + std::memcpy(&fbits, &value, sizeof(float)); +#if 1 + unsigned int sign = (fbits >> 16) & 0x8000; + fbits &= 0x7FFFFFFF; + if(fbits >= 0x7F800000) + return sign | 0x7C00 | ((fbits > 0x7F800000) ? (0x200 | ((fbits >> 13) & 0x3FF)) : 0); + if(fbits >= 0x47800000) + return overflow(sign); + if(fbits >= 0x38800000) + return rounded(sign | (((fbits >> 23) - 112) << 10) | ((fbits >> 13) & 0x3FF), + (fbits >> 12) & 1, + (fbits & 0xFFF) != 0); + if(fbits >= 0x33000000) + { + int i = 125 - (fbits >> 23); + fbits = (fbits & 0x7FFFFF) | 0x800000; + return rounded(sign | (fbits >> (i + 1)), + (fbits >> i) & 1, + (fbits & ((static_cast(1) << i) - 1)) != 0); + } + if(fbits != 0) + return underflow(sign); + return sign; +#else + static const uint16 base_table[512] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, + 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, + 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00, 0x4000, 0x4400, 0x4800, 0x4C00, + 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, + 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, + 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, + 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, + 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, + 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, + 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, + 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, + 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, + 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, + 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 
0x7BFF, 0x7BFF, + 0x7BFF, 0x7BFF, 0x7C00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, + 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, + 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00, 0xC000, + 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, + 0xF000, 0xF400, 0xF800, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, + 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, + 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, + 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, + 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, + 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, + 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, + 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, + 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, + 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, + 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFC00}; + static const unsigned char shift_table[256] = { + 24, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 24, 23, 22, 21, 20, 19, 18, 17, + 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13}; + int sexp = fbits >> 23, exp = sexp & 0xFF, i = shift_table[exp]; + fbits &= 0x7FFFFF; + uint32 m = (fbits | ((exp != 0) << 23)) & -static_cast(exp != 0xFF); + return rounded(base_table[sexp] + (fbits >> i), + (m >> (i - 1)) & 1, + (((static_cast(1) << (i - 1)) - 1) & m) != 0); +#endif +#endif +} + +/// Convert IEEE 
double-precision to half-precision. +/// \tparam R rounding mode to use +/// \param value double-precision value to convert +/// \return rounded half-precision value +/// \exception FE_OVERFLOW on overflows +/// \exception FE_UNDERFLOW on underflows +/// \exception FE_INEXACT if value had to be rounded +template +unsigned int float2half_impl(double value, true_type) +{ +#if HALF_ENABLE_F16C_INTRINSICS + if(R == std::round_indeterminate) + return _mm_cvtsi128_si32( + _mm_cvtps_ph(_mm_cvtpd_ps(_mm_set_sd(value)), _MM_FROUND_CUR_DIRECTION)); +#endif + bits::type dbits; + std::memcpy(&dbits, &value, sizeof(double)); + uint32 hi = dbits >> 32, lo = dbits & 0xFFFFFFFF; + unsigned int sign = (hi >> 16) & 0x8000; + hi &= 0x7FFFFFFF; + if(hi >= 0x7FF00000) + return sign | 0x7C00 | ((dbits & 0xFFFFFFFFFFFFF) ? (0x200 | ((hi >> 10) & 0x3FF)) : 0); + if(hi >= 0x40F00000) + return overflow(sign); + if(hi >= 0x3F100000) + return rounded(sign | (((hi >> 20) - 1008) << 10) | ((hi >> 10) & 0x3FF), + (hi >> 9) & 1, + ((hi & 0x1FF) | lo) != 0); + if(hi >= 0x3E600000) + { + int i = 1018 - (hi >> 20); + hi = (hi & 0xFFFFF) | 0x100000; + return rounded(sign | (hi >> (i + 1)), + (hi >> i) & 1, + ((hi & ((static_cast(1) << i) - 1)) | lo) != 0); + } + if((hi | lo) != 0) + return underflow(sign); + return sign; +} + +/// Convert non-IEEE floating-point to half-precision. +/// \tparam R rounding mode to use +/// \tparam T source type (builtin floating-point type) +/// \param value floating-point value to convert +/// \return rounded half-precision value +/// \exception FE_OVERFLOW on overflows +/// \exception FE_UNDERFLOW on underflows +/// \exception FE_INEXACT if value had to be rounded +template +unsigned int float2half_impl(T value, ...) +{ + unsigned int hbits = static_cast(builtin_signbit(value)) << 15; + if(value == T()) + return hbits; + if(builtin_isnan(value)) + return hbits | 0x7FFF; + if(builtin_isinf(value)) + return hbits | 0x7C00; + int exp; + std::frexp(value, &exp); + if(exp > 16) + return overflow(hbits); + if(exp < -13) + value = std::ldexp(value, 25); + else + { + value = std::ldexp(value, 12 - exp); + hbits |= ((exp + 13) << 10); + } + T ival, frac = std::modf(value, &ival); + int m = std::abs(static_cast(ival)); + return rounded(hbits + (m >> 1), m & 1, frac != T()); +} + +/// Convert floating-point to half-precision. +/// \tparam R rounding mode to use +/// \tparam T source type (builtin floating-point type) +/// \param value floating-point value to convert +/// \return rounded half-precision value +/// \exception FE_OVERFLOW on overflows +/// \exception FE_UNDERFLOW on underflows +/// \exception FE_INEXACT if value had to be rounded +template +unsigned int float2half(T value) +{ + return float2half_impl(value, + bool_type < std::numeric_limits::is_iec559 && + sizeof(typename bits::type) == sizeof(T) > ()); +} + +/// Convert integer to half-precision floating-point. 
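// Worked example (illustrative) for the single-precision path above: 1.5f has the bit
// pattern 0x3FC00000, which lands in the normal range (>= 0x38800000). The exponent is
// re-biased from 127 to 127 - 112 = 15 and the top ten mantissa bits are 0x200, giving the
// candidate 0x3E00 with guard and sticky bits both zero, so
// float2half<std::round_to_nearest>(1.5f) returns 0x3E00 -- exponent 15, mantissa 0x200,
// i.e. 1.5 in half precision.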
+/// \tparam R rounding mode to use +/// \tparam T type to convert (builtin integer type) +/// \param value integral value to convert +/// \return rounded half-precision value +/// \exception FE_OVERFLOW on overflows +/// \exception FE_INEXACT if value had to be rounded +template +unsigned int int2half(T value) +{ + unsigned int bits = static_cast(value < 0) << 15; + if(!value) + return bits; + if(bits) + value = -value; + if(value > 0xFFFF) + return overflow(bits); + unsigned int m = static_cast(value), exp = 24; + for(; m < 0x400; m <<= 1, --exp) + ; + for(; m > 0x7FF; m >>= 1, ++exp) + ; + bits |= (exp << 10) + m; + return (exp > 24) ? rounded( + bits, (value >> (exp - 25)) & 1, (((1 << (exp - 25)) - 1) & value) != 0) + : bits; +} + +/// Convert half-precision to IEEE single-precision. +/// Credit for this goes to [Jeroen van der +/// Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf). +/// \param value half-precision value to convert +/// \return single-precision value +inline float half2float_impl(unsigned int value, float, true_type) +{ +#if HALF_ENABLE_F16C_INTRINSICS + return _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(value))); +#else +#if 0 + bits::type fbits = static_cast::type>(value&0x8000) << 16; + int abs = value & 0x7FFF; + if(abs) + { + fbits |= 0x38000000 << static_cast(abs>=0x7C00); + for(; abs<0x400; abs<<=1,fbits-=0x800000) ; + fbits += static_cast::type>(abs) << 13; + } +#else + static const bits::type mantissa_table[2048] = { + 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, + 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, + 0x35600000, 0x35700000, 0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, + 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, + 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000, 0x36000000, 0x36040000, 0x36080000, + 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, + 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, 0x36400000, + 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, + 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, + 0x367C0000, 0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, + 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, + 0x369A0000, 0x369C0000, 0x369E0000, 0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, + 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, + 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, 0x36C00000, 0x36C20000, + 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, + 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000, + 0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, + 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, + 0x36FC0000, 0x36FE0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, + 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, + 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000, 0x37100000, 0x37110000, 0x37120000, + 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, + 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000, 0x37200000, + 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, + 
0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, + 0x372F0000, 0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, + 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, + 0x373D0000, 0x373E0000, 0x373F0000, 0x37400000, 0x37410000, 0x37420000, 0x37430000, + 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, + 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, 0x37500000, 0x37510000, + 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, + 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000, + 0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, + 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, + 0x376E0000, 0x376F0000, 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, + 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377A0000, 0x377B0000, + 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000, 0x37800000, 0x37808000, 0x37810000, + 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, + 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000, 0x37880000, + 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, + 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, + 0x378F8000, 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, + 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, + 0x37968000, 0x37970000, 0x37978000, 0x37980000, 0x37988000, 0x37990000, 0x37998000, + 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, + 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000, 0x37A00000, 0x37A08000, + 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, + 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, + 0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, + 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, + 0x37AF0000, 0x37AF8000, 0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, + 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, + 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000, 0x37B80000, 0x37B88000, 0x37B90000, + 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, + 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, 0x37C00000, + 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, + 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, + 0x37C78000, 0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, + 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, + 0x37CE8000, 0x37CF0000, 0x37CF8000, 0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, + 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, + 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000, 0x37D80000, 0x37D88000, + 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, + 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000, + 0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, + 0x37E38000, 0x37E40000, 
0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, + 0x37E70000, 0x37E78000, 0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, + 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, + 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000, 0x37F00000, 0x37F08000, 0x37F10000, + 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, + 0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, 0x37F80000, + 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, + 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, + 0x37FF8000, 0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, + 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, + 0x38034000, 0x38038000, 0x3803C000, 0x38040000, 0x38044000, 0x38048000, 0x3804C000, + 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, + 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000, 0x38080000, 0x38084000, + 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, + 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000, + 0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, + 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, + 0x380F8000, 0x380FC000, 0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, + 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, + 0x38130000, 0x38134000, 0x38138000, 0x3813C000, 0x38140000, 0x38144000, 0x38148000, + 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, + 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000, 0x38180000, + 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, + 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, + 0x381BC000, 0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, + 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, + 0x381F4000, 0x381F8000, 0x381FC000, 0x38200000, 0x38204000, 0x38208000, 0x3820C000, + 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, + 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000, 0x38240000, 0x38244000, + 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, + 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000, + 0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, + 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, + 0x382B8000, 0x382BC000, 0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, + 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, + 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000, 0x38300000, 0x38304000, 0x38308000, + 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, + 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000, 0x38340000, + 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, + 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, + 0x3837C000, 0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, + 0x38398000, 0x3839C000, 0x383A0000, 0x383A4000, 
0x383A8000, 0x383AC000, 0x383B0000, + 0x383B4000, 0x383B8000, 0x383BC000, 0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, + 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, + 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, 0x38400000, 0x38404000, + 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, + 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000, + 0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, + 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, + 0x38478000, 0x3847C000, 0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, + 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, + 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, 0x384C0000, 0x384C4000, 0x384C8000, + 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, + 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000, 0x38500000, + 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, + 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, + 0x3853C000, 0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, + 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, + 0x38574000, 0x38578000, 0x3857C000, 0x38580000, 0x38584000, 0x38588000, 0x3858C000, + 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, + 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000, 0x385C0000, 0x385C4000, + 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, + 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000, + 0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, + 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, + 0x38638000, 0x3863C000, 0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, + 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, + 0x38670000, 0x38674000, 0x38678000, 0x3867C000, 0x38680000, 0x38684000, 0x38688000, + 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, + 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000, 0x386C0000, + 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, + 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, + 0x386FC000, 0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, + 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, + 0x38734000, 0x38738000, 0x3873C000, 0x38740000, 0x38744000, 0x38748000, 0x3874C000, + 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, + 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000, 0x38780000, 0x38784000, + 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, + 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000, + 0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, + 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, + 0x387F8000, 0x387FC000, 0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, + 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 
0x38016000, + 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000, 0x38020000, 0x38022000, 0x38024000, + 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, + 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, 0x38040000, + 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, + 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, + 0x3805E000, 0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, 0x3806A000, + 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, + 0x3807A000, 0x3807C000, 0x3807E000, 0x38080000, 0x38082000, 0x38084000, 0x38086000, + 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, + 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, 0x380A0000, 0x380A2000, + 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, + 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000, + 0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, + 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, + 0x380DC000, 0x380DE000, 0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, + 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, + 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000, 0x38100000, 0x38102000, 0x38104000, + 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, + 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000, 0x38120000, + 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, + 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, + 0x3813E000, 0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, + 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, + 0x3815A000, 0x3815C000, 0x3815E000, 0x38160000, 0x38162000, 0x38164000, 0x38166000, + 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, + 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, 0x38180000, 0x38182000, + 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, + 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000, + 0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, + 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, + 0x381BC000, 0x381BE000, 0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, + 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, + 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, 0x381E0000, 0x381E2000, 0x381E4000, + 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, + 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000, 0x38200000, + 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, + 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, + 0x3821E000, 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, + 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, + 0x3823A000, 0x3823C000, 0x3823E000, 0x38240000, 0x38242000, 0x38244000, 0x38246000, + 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, + 0x38256000, 
0x38258000, 0x3825A000, 0x3825C000, 0x3825E000, 0x38260000, 0x38262000, + 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, + 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, + 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, + 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, + 0x3829C000, 0x3829E000, 0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, + 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, + 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000, 0x382C0000, 0x382C2000, 0x382C4000, + 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, + 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000, 0x382E0000, + 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, + 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, + 0x382FE000, 0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, + 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, + 0x3831A000, 0x3831C000, 0x3831E000, 0x38320000, 0x38322000, 0x38324000, 0x38326000, + 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, + 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000, 0x38340000, 0x38342000, + 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, + 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000, + 0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, + 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, + 0x3837C000, 0x3837E000, 0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, + 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, + 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, 0x383A0000, 0x383A2000, 0x383A4000, + 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, + 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, 0x383C0000, + 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, + 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, + 0x383DE000, 0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, + 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, + 0x383FA000, 0x383FC000, 0x383FE000, 0x38400000, 0x38402000, 0x38404000, 0x38406000, + 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, + 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000, 0x38420000, 0x38422000, + 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, + 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000, + 0x38440000, 0x38442000, 0x38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, + 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, + 0x3845C000, 0x3845E000, 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, + 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, + 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000, 0x38480000, 0x38482000, 0x38484000, + 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, + 0x38494000, 0x38496000, 0x38498000, 
0x3849A000, 0x3849C000, 0x3849E000, 0x384A0000, + 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, + 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, + 0x384BE000, 0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, + 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, + 0x384DA000, 0x384DC000, 0x384DE000, 0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, + 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, + 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000, 0x38500000, 0x38502000, + 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, + 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000, + 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, + 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, + 0x3853C000, 0x3853E000, 0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, + 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, + 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000, 0x38560000, 0x38562000, 0x38564000, + 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, + 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000, 0x38580000, + 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x3858E000, + 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, + 0x3859E000, 0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, + 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, + 0x385BA000, 0x385BC000, 0x385BE000, 0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, + 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, + 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, 0x385E0000, 0x385E2000, + 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, + 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000, + 0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, + 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, + 0x3861C000, 0x3861E000, 0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, + 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, + 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, 0x38640000, 0x38642000, 0x38644000, + 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, + 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000, 0x38660000, + 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, + 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, + 0x3867E000, 0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, + 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, + 0x3869A000, 0x3869C000, 0x3869E000, 0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, + 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, + 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000, 0x386C0000, 0x386C2000, + 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, + 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 
0x386DC000, 0x386DE000, + 0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, + 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, + 0x386FC000, 0x386FE000, 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, + 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, + 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, 0x38720000, 0x38722000, 0x38724000, + 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, + 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000, 0x38740000, + 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, + 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, + 0x3875E000, 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, + 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, + 0x3877A000, 0x3877C000, 0x3877E000, 0x38780000, 0x38782000, 0x38784000, 0x38786000, + 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, + 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000, 0x387A0000, 0x387A2000, + 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, + 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000, + 0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, + 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, + 0x387DC000, 0x387DE000, 0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, + 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, + 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000}; + static const bits::type exponent_table[64] = { + 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, + 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, + 0x07000000, 0x07800000, 0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, + 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, + 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000, 0x80000000, 0x80800000, 0x81000000, + 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, + 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, 0x88000000, + 0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, + 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, + 0xC7800000}; + static const unsigned short offset_table[64] = { + 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, + 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, + 1024, 1024, 1024, 1024, 1024, 1024, 0, 1024, 1024, 1024, 1024, 1024, 1024, + 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, + 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024}; + bits::type fbits = + mantissa_table[offset_table[value >> 10] + (value & 0x3FF)] + exponent_table[value >> 10]; +#endif + float out; + std::memcpy(&out, &fbits, sizeof(float)); + return out; +#endif +} + +/// Convert half-precision to IEEE double-precision. 
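+// Worked example for the table-based float conversion above, as a reading aid
+// (all numbers taken directly from mantissa_table/exponent_table/offset_table):
+// for the half-precision pattern 0x3C00 (= 1.0), value>>10 = 15 and value&0x3FF = 0, so
+//   fbits = mantissa_table[offset_table[15] + 0] + exponent_table[15]
+//         = mantissa_table[1024] + 0x07800000
+//         = 0x38000000 + 0x07800000 = 0x3F800000, i.e. 1.0f.
+// Likewise 0xC000 (= -2.0) uses row 48: 0x38000000 + 0x88000000 = 0xC0000000 = -2.0f.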
+/// \param value half-precision value to convert +/// \return double-precision value +inline double half2float_impl(unsigned int value, double, true_type) +{ +#if HALF_ENABLE_F16C_INTRINSICS + return _mm_cvtsd_f64(_mm_cvtps_pd(_mm_cvtph_ps(_mm_cvtsi32_si128(value)))); +#else + uint32 hi = static_cast(value & 0x8000) << 16; + unsigned int abs = value & 0x7FFF; + if(abs) + { + hi |= 0x3F000000 << static_cast(abs >= 0x7C00); + for(; abs < 0x400; abs <<= 1, hi -= 0x100000) + ; + hi += static_cast(abs) << 10; + } + bits::type dbits = static_cast::type>(hi) << 32; + double out; + std::memcpy(&out, &dbits, sizeof(double)); + return out; +#endif +} + +/// Convert half-precision to non-IEEE floating-point. +/// \tparam T type to convert to (builtin integer type) +/// \param value half-precision value to convert +/// \return floating-point value +template +T half2float_impl(unsigned int value, T, ...) +{ + T out; + unsigned int abs = value & 0x7FFF; + if(abs > 0x7C00) + out = + (std::numeric_limits::has_signaling_NaN && !(abs & 0x200)) + ? std::numeric_limits::signaling_NaN() + : std::numeric_limits::has_quiet_NaN ? std::numeric_limits::quiet_NaN() : T(); + else if(abs == 0x7C00) + out = std::numeric_limits::has_infinity ? std::numeric_limits::infinity() + : std::numeric_limits::max(); + else if(abs > 0x3FF) + out = std::ldexp(static_cast((abs & 0x3FF) | 0x400), (abs >> 10) - 25); + else + out = std::ldexp(static_cast(abs), -24); + return (value & 0x8000) ? -out : out; +} + +/// Convert half-precision to floating-point. +/// \tparam T type to convert to (builtin integer type) +/// \param value half-precision value to convert +/// \return floating-point value +template +T half2float(unsigned int value) +{ + return half2float_impl(value, + T(), + bool_type < std::numeric_limits::is_iec559 && + sizeof(typename bits::type) == sizeof(T) > ()); +} + +/// Convert half-precision floating-point to integer. +/// \tparam R rounding mode to use +/// \tparam E `true` for round to even, `false` for round away from zero +/// \tparam I `true` to raise INEXACT exception (if inexact), `false` to never raise it +/// \tparam T type to convert to (buitlin integer type with at least 16 bits precision, excluding +/// any implicit sign bits) +/// \param value half-precision value to convert +/// \return rounded integer value +/// \exception FE_INVALID if value is not representable in type \a T +/// \exception FE_INEXACT if value had to be rounded and \a I is `true` +template +T half2int(unsigned int value) +{ + unsigned int abs = value & 0x7FFF; + if(abs >= 0x7C00) + { + raise(FE_INVALID); + return (value & 0x8000) ? std::numeric_limits::min() : std::numeric_limits::max(); + } + if(abs < 0x3800) + { + raise(FE_INEXACT, I); + return (R == std::round_toward_infinity) + ? T(~(value >> 15) & (abs != 0)) + : (R == std::round_toward_neg_infinity) ? -T(value > 0x8000) : T(); + } + int exp = 25 - (abs >> 10); + unsigned int m = (value & 0x3FF) | 0x400; + int32 i = static_cast( + (exp <= 0) + ? (m << -exp) + : ((m + ((R == std::round_to_nearest) ? ((1 << (exp - 1)) - (~(m >> exp) & E)) + : (R == std::round_toward_infinity) + ? (((1 << exp) - 1) & ((value >> 15) - 1)) + : (R == std::round_toward_neg_infinity) + ? (((1 << exp) - 1) & -(value >> 15)) + : 0)) >> + exp)); + if((!std::numeric_limits::is_signed && (value & 0x8000)) || + (std::numeric_limits::digits < 16 && + ((value & 0x8000) ? 
(-i < std::numeric_limits::min()) + : (i > std::numeric_limits::max())))) + raise(FE_INVALID); + else if(I && exp > 0 && (m & ((1 << exp) - 1))) + raise(FE_INEXACT); + return static_cast((value & 0x8000) ? -i : i); +} + +/// \} +/// \name Mathematics +/// \{ + +/// upper part of 64-bit multiplication. +/// \tparam R rounding mode to use +/// \param x first factor +/// \param y second factor +/// \return upper 32 bit of \a x * \a y +template +uint32 mulhi(uint32 x, uint32 y) +{ + uint32 xy = (x >> 16) * (y & 0xFFFF), yx = (x & 0xFFFF) * (y >> 16), + c = (xy & 0xFFFF) + (yx & 0xFFFF) + (((x & 0xFFFF) * (y & 0xFFFF)) >> 16); + return (x >> 16) * (y >> 16) + (xy >> 16) + (yx >> 16) + (c >> 16) + + ((R == std::round_to_nearest) + ? ((c >> 15) & 1) + : (R == std::round_toward_infinity) ? ((c & 0xFFFF) != 0) : 0); +} + +/// 64-bit multiplication. +/// \param x first factor +/// \param y second factor +/// \return upper 32 bit of \a x * \a y rounded to nearest +inline uint32 multiply64(uint32 x, uint32 y) +{ +#if HALF_ENABLE_CPP11_LONG_LONG + return static_cast( + (static_cast(x) * static_cast(y) + 0x80000000) >> + 32); +#else + return mulhi(x, y); +#endif +} + +/// 64-bit division. +/// \param x upper 32 bit of dividend +/// \param y divisor +/// \param s variable to store sticky bit for rounding +/// \return (\a x << 32) / \a y +inline uint32 divide64(uint32 x, uint32 y, int& s) +{ +#if HALF_ENABLE_CPP11_LONG_LONG + unsigned long long xx = static_cast(x) << 32; + return s = (xx % y != 0), static_cast(xx / y); +#else + y >>= 1; + uint32 rem = x, div = 0; + for(unsigned int i = 0; i < 32; ++i) + { + div <<= 1; + if(rem >= y) + { + rem -= y; + div |= 1; + } + rem <<= 1; + } + return s = rem > 1, div; +#endif +} + +/// Half precision positive modulus. +/// \tparam Q `true` to compute full quotient, `false` else +/// \tparam R `true` to compute signed remainder, `false` for positive remainder +/// \param x first operand as positive finite half-precision value +/// \param y second operand as positive finite half-precision value +/// \param quo adress to store quotient at, `nullptr` if \a Q `false` +/// \return modulus of \a x / \a y +template +unsigned int mod(unsigned int x, unsigned int y, int* quo = NULL) +{ + unsigned int q = 0; + if(x > y) + { + int absx = x, absy = y, expx = 0, expy = 0; + for(; absx < 0x400; absx <<= 1, --expx) + ; + for(; absy < 0x400; absy <<= 1, --expy) + ; + expx += absx >> 10; + expy += absy >> 10; + int mx = (absx & 0x3FF) | 0x400, my = (absy & 0x3FF) | 0x400; + for(int d = expx - expy; d; --d) + { + if(!Q && mx == my) + return 0; + if(mx >= my) + { + mx -= my; + q += Q; + } + mx <<= 1; + q <<= static_cast(Q); + } + if(!Q && mx == my) + return 0; + if(mx >= my) + { + mx -= my; + ++q; + } + if(Q) + { + q &= (1 << (std::numeric_limits::digits - 1)) - 1; + if(!mx) + return *quo = q, 0; + } + for(; mx < 0x400; mx <<= 1, --expy) + ; + x = (expy > 0) ? ((expy << 10) | (mx & 0x3FF)) : (mx >> (1 - expy)); + } + if(R) + { + unsigned int a, b; + if(y < 0x800) + { + a = (x < 0x400) ? (x << 1) : (x + 0x400); + b = y; + } + else + { + a = x; + b = y - 0x400; + } + if(a > b || (a == b && (q & 1))) + { + int exp = (y >> 10) + (y <= 0x3FF), d = exp - (x >> 10) - (x <= 0x3FF); + int m = (((y & 0x3FF) | ((y > 0x3FF) << 10)) << 1) - + (((x & 0x3FF) | ((x > 0x3FF) << 10)) << (1 - d)); + for(; m < 0x800 && exp > 1; m <<= 1, --exp) + ; + x = 0x8000 + ((exp - 1) << 10) + (m >> 1); + q += Q; + } + } + if(Q) + *quo = q; + return x; +} + +/// Fixed point square root. 
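+// Hand-checked sample values for the 64-bit helpers above, as a reading aid:
+// multiply64() returns the upper 32 bits of the 64-bit product rounded to nearest, e.g.
+//   multiply64(0xC0000000, 0xC0000000) == 0x90000000   // 1.5 * 1.5 = 2.25 (Q1.31 in, Q2.30 out)
+// and divide64() returns the upper 32 bits of (x << 32) / y plus a sticky flag, e.g.
+//   int s; divide64(0x40000000, 0xC0000000, s) == 0x55555555 with s == 1   // 1.0 / 1.5, inexact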
+/// \tparam F number of fractional bits +/// \param r radicand in Q1.F fixed point format +/// \param exp exponent +/// \return square root as Q1.F/2 +template +uint32 sqrt(uint32& r, int& exp) +{ + int i = exp & 1; + r <<= i; + exp = (exp - i) / 2; + uint32 m = 0; + for(uint32 bit = static_cast(1) << F; bit; bit >>= 2) + { + if(r < m + bit) + m >>= 1; + else + { + r -= m + bit; + m = (m >> 1) + bit; + } + } + return m; +} + +/// Fixed point binary exponential. +/// This uses the BKM algorithm in E-mode. +/// \param m exponent in [0,1) as Q0.31 +/// \param n number of iterations (at most 32) +/// \return 2 ^ \a m as Q1.31 +inline uint32 exp2(uint32 m, unsigned int n = 32) +{ + static const uint32 logs[] = { + 0x80000000, 0x4AE00D1D, 0x2934F098, 0x15C01A3A, 0x0B31FB7D, 0x05AEB4DD, 0x02DCF2D1, + 0x016FE50B, 0x00B84E23, 0x005C3E10, 0x002E24CA, 0x001713D6, 0x000B8A47, 0x0005C53B, + 0x0002E2A3, 0x00017153, 0x0000B8AA, 0x00005C55, 0x00002E2B, 0x00001715, 0x00000B8B, + 0x000005C5, 0x000002E3, 0x00000171, 0x000000B9, 0x0000005C, 0x0000002E, 0x00000017, + 0x0000000C, 0x00000006, 0x00000003, 0x00000001}; + if(!m) + return 0x80000000; + uint32 mx = 0x80000000, my = 0; + for(unsigned int i = 1; i < n; ++i) + { + uint32 mz = my + logs[i]; + if(mz <= m) + { + my = mz; + mx += mx >> i; + } + } + return mx; +} + +/// Fixed point binary logarithm. +/// This uses the BKM algorithm in L-mode. +/// \param m mantissa in [1,2) as Q1.30 +/// \param n number of iterations (at most 32) +/// \return log2(\a m) as Q0.31 +inline uint32 log2(uint32 m, unsigned int n = 32) +{ + static const uint32 logs[] = { + 0x80000000, 0x4AE00D1D, 0x2934F098, 0x15C01A3A, 0x0B31FB7D, 0x05AEB4DD, 0x02DCF2D1, + 0x016FE50B, 0x00B84E23, 0x005C3E10, 0x002E24CA, 0x001713D6, 0x000B8A47, 0x0005C53B, + 0x0002E2A3, 0x00017153, 0x0000B8AA, 0x00005C55, 0x00002E2B, 0x00001715, 0x00000B8B, + 0x000005C5, 0x000002E3, 0x00000171, 0x000000B9, 0x0000005C, 0x0000002E, 0x00000017, + 0x0000000C, 0x00000006, 0x00000003, 0x00000001}; + if(m == 0x40000000) + return 0; + uint32 mx = 0x40000000, my = 0; + for(unsigned int i = 1; i < n; ++i) + { + uint32 mz = mx + (mx >> i); + if(mz <= m) + { + mx = mz; + my += logs[i]; + } + } + return my; +} + +/// Fixed point sine and cosine. +/// This uses the CORDIC algorithm in rotation mode. +/// \param mz angle in [-pi/2,pi/2] as Q1.30 +/// \param n number of iterations (at most 31) +/// \return sine and cosine of \a mz as Q1.30 +inline std::pair sincos(uint32 mz, unsigned int n = 31) +{ + static const uint32 angles[] = { + 0x3243F6A9, 0x1DAC6705, 0x0FADBAFD, 0x07F56EA7, 0x03FEAB77, 0x01FFD55C, 0x00FFFAAB, + 0x007FFF55, 0x003FFFEB, 0x001FFFFD, 0x00100000, 0x00080000, 0x00040000, 0x00020000, + 0x00010000, 0x00008000, 0x00004000, 0x00002000, 0x00001000, 0x00000800, 0x00000400, + 0x00000200, 0x00000100, 0x00000080, 0x00000040, 0x00000020, 0x00000010, 0x00000008, + 0x00000004, 0x00000002, 0x00000001}; + uint32 mx = 0x26DD3B6A, my = 0; + for(unsigned int i = 0; i < n; ++i) + { + uint32 sign = sign_mask(mz); + uint32 tx = mx - (arithmetic_shift(my, i) ^ sign) + sign; + uint32 ty = my + (arithmetic_shift(mx, i) ^ sign) - sign; + mx = tx; + my = ty; + mz -= (angles[i] ^ sign) - sign; + } + return std::make_pair(my, mx); +} + +/// Fixed point arc tangent. +/// This uses the CORDIC algorithm in vectoring mode. 
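+// Fixed-point conventions of the helpers above, with approximate sample values
+// (the BKM/CORDIC iterations converge to these within a few ULP):
+//   exp2(0x40000000)           ~ 0xB504F334   // 2^0.5 = sqrt(2) ~ 1.4142 as Q1.31
+//   log2(0x60000000)           ~ 0x4AE00D1D   // log2(1.5) ~ 0.58496 as Q0.31
+//   sincos(0x3243F6A9).first   ~ 0x2D413CCD   // sin(pi/4) ~ 0.7071 as Q1.30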
+/// \param my y coordinate as Q0.30 +/// \param mx x coordinate as Q0.30 +/// \param n number of iterations (at most 31) +/// \return arc tangent of \a my / \a mx as Q1.30 +inline uint32 atan2(uint32 my, uint32 mx, unsigned int n = 31) +{ + static const uint32 angles[] = { + 0x3243F6A9, 0x1DAC6705, 0x0FADBAFD, 0x07F56EA7, 0x03FEAB77, 0x01FFD55C, 0x00FFFAAB, + 0x007FFF55, 0x003FFFEB, 0x001FFFFD, 0x00100000, 0x00080000, 0x00040000, 0x00020000, + 0x00010000, 0x00008000, 0x00004000, 0x00002000, 0x00001000, 0x00000800, 0x00000400, + 0x00000200, 0x00000100, 0x00000080, 0x00000040, 0x00000020, 0x00000010, 0x00000008, + 0x00000004, 0x00000002, 0x00000001}; + uint32 mz = 0; + for(unsigned int i = 0; i < n; ++i) + { + uint32 sign = sign_mask(my); + uint32 tx = mx + (arithmetic_shift(my, i) ^ sign) - sign; + uint32 ty = my - (arithmetic_shift(mx, i) ^ sign) + sign; + mx = tx; + my = ty; + mz += (angles[i] ^ sign) - sign; + } + return mz; +} + +/// Reduce argument for trigonometric functions. +/// \param abs half-precision floating-point value +/// \param k value to take quarter period +/// \return \a abs reduced to [-pi/4,pi/4] as Q0.30 +inline uint32 angle_arg(unsigned int abs, int& k) +{ + uint32 m = (abs & 0x3FF) | ((abs > 0x3FF) << 10); + int exp = (abs >> 10) + (abs <= 0x3FF) - 15; + if(abs < 0x3A48) + return k = 0, m << (exp + 20); +#if HALF_ENABLE_CPP11_LONG_LONG + unsigned long long y = m * 0xA2F9836E4E442, mask = (1ULL << (62 - exp)) - 1, + yi = (y + (mask >> 1)) & ~mask, f = y - yi; + uint32 sign = -static_cast(f >> 63); + k = static_cast(yi >> (62 - exp)); + return (multiply64(static_cast((sign ? -f : f) >> (31 - exp)), 0xC90FDAA2) ^ sign) - + sign; +#else + uint32 yh = m * 0xA2F98 + mulhi(m, 0x36E4E442), + yl = (m * 0x36E4E442) & 0xFFFFFFFF; + uint32 mask = (static_cast(1) << (30 - exp)) - 1, yi = (yh + (mask >> 1)) & ~mask, + sign = -static_cast(yi > yh); + k = static_cast(yi >> (30 - exp)); + uint32 fh = (yh ^ sign) + (yi ^ ~sign) - ~sign, fl = (yl ^ sign) - sign; + return (multiply64((exp > -1) + ? (((fh << (1 + exp)) & 0xFFFFFFFF) | ((fl & 0xFFFFFFFF) >> (31 - exp))) + : fh, + 0xC90FDAA2) ^ + sign) - + sign; +#endif +} + +/// Get arguments for atan2 function. +/// \param abs half-precision floating-point value +/// \return \a abs and sqrt(1 - \a abs^2) as Q0.30 +inline std::pair atan2_args(unsigned int abs) +{ + int exp = -15; + for(; abs < 0x400; abs <<= 1, --exp) + ; + exp += abs >> 10; + uint32 my = ((abs & 0x3FF) | 0x400) << 5, r = my * my; + int rexp = 2 * exp; + r = 0x40000000 - + ((rexp > -31) ? ((r >> -rexp) | ((r & ((static_cast(1) << -rexp) - 1)) != 0)) : 1); + for(rexp = 0; r < 0x40000000; r <<= 1, --rexp) + ; + uint32 mx = sqrt<30>(r, rexp); + int d = exp - rexp; + if(d < 0) + return std::make_pair((d < -14) ? ((my >> (-d - 14)) + ((my >> (-d - 15)) & 1)) + : (my << (14 + d)), + (mx << 14) + (r << 13) / mx); + if(d > 0) + return std::make_pair(my << 14, + (d > 14) + ? ((mx >> (d - 14)) + ((mx >> (d - 15)) & 1)) + : ((d == 14) ? 
mx : ((mx << (14 - d)) + (r << (13 - d)) / mx))); + return std::make_pair(my << 13, (mx << 13) + (r << 12) / mx); +} + +/// Get exponentials for hyperbolic computation +/// \param abs half-precision floating-point value +/// \param exp variable to take unbiased exponent of larger result +/// \param n number of BKM iterations (at most 32) +/// \return exp(abs) and exp(-\a abs) as Q1.31 with same exponent +inline std::pair hyperbolic_args(unsigned int abs, int& exp, unsigned int n = 32) +{ + uint32 mx = detail::multiply64(static_cast((abs & 0x3FF) + ((abs > 0x3FF) << 10)) << 21, + 0xB8AA3B29), + my; + int e = (abs >> 10) + (abs <= 0x3FF); + if(e < 14) + { + exp = 0; + mx >>= 14 - e; + } + else + { + exp = mx >> (45 - e); + mx = (mx << (e - 14)) & 0x7FFFFFFF; + } + mx = exp2(mx, n); + int d = exp << 1, s; + if(mx > 0x80000000) + { + my = divide64(0x80000000, mx, s); + my |= s; + ++d; + } + else + my = mx; + return std::make_pair( + mx, (d < 31) ? ((my >> d) | ((my & ((static_cast(1) << d) - 1)) != 0)) : 1); +} + +/// Postprocessing for binary exponential. +/// \tparam R rounding mode to use +/// \tparam I `true` to always raise INEXACT exception, `false` to raise only for rounded results +/// \param m mantissa as Q1.31 +/// \param exp absolute value of unbiased exponent +/// \param esign sign of actual exponent +/// \param sign sign bit of result +/// \return value converted to half-precision +/// \exception FE_OVERFLOW on overflows +/// \exception FE_UNDERFLOW on underflows +/// \exception FE_INEXACT if value had to be rounded or \a I is `true` +template +unsigned int exp2_post(uint32 m, int exp, bool esign, unsigned int sign = 0) +{ + int s = 0; + if(esign) + { + if(m > 0x80000000) + { + m = divide64(0x80000000, m, s); + ++exp; + } + if(exp > 25) + return underflow(sign); + else if(exp == 25) + return rounded(sign, 1, (m & 0x7FFFFFFF) != 0); + exp = -exp; + } + else if(exp > 15) + return overflow(sign); + return fixed2half(m, exp + 14, sign, s); +} + +/// Postprocessing for binary logarithm. +/// \tparam R rounding mode to use +/// \tparam L logarithm for base transformation as Q1.31 +/// \param m fractional part of logarithm as Q0.31 +/// \param ilog signed integer part of logarithm +/// \param exp biased exponent of result +/// \param sign sign bit of result +/// \return value base-transformed and converted to half-precision +/// \exception FE_OVERFLOW on overflows +/// \exception FE_UNDERFLOW on underflows +/// \exception FE_INEXACT if no other exception occurred +template +unsigned int log2_post(uint32 m, int ilog, int exp, unsigned int sign = 0) +{ + uint32 msign = sign_mask(ilog); + m = (((static_cast(ilog) << 27) + (m >> 4)) ^ msign) - msign; + if(!m) + return 0; + for(; m < 0x80000000; m <<= 1, --exp) + ; + int i = m >= L, s; + exp += i; + m >>= 1 + i; + sign ^= msign & 0x8000; + if(exp < -11) + return underflow(sign); + m = divide64(m, L, s); + return fixed2half(m, exp, sign, 1); +} + +/// Hypotenuse square root and postprocessing. 
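+// Reading aid for hyperbolic_args() above: it returns (exp(|x|), exp(-|x|)) as Q1.31
+// values sharing the exponent written to `exp`, so a caller (the hyperbolic functions,
+// which live outside this excerpt) can form e.g.
+//   sinh(|x|) = (e.first - e.second) / 2   and   cosh(|x|) = (e.first + e.second) / 2,
+// where the division by 2 is just a decrement of the shared exponent before rounding.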
+/// \tparam R rounding mode to use +/// \param r mantissa as Q2.30 +/// \param exp unbiased exponent +/// \return square root converted to half-precision +/// \exception FE_OVERFLOW on overflows +/// \exception FE_UNDERFLOW on underflows +/// \exception FE_INEXACT if value had to be rounded +template +unsigned int hypot_post(uint32 r, int exp) +{ + int i = r >> 31; + if((exp += i) > 46) + return overflow(); + if(exp < -34) + return underflow(); + r = (r >> i) | (r & i); + uint32 m = sqrt<30>(r, exp += 15); + return fixed2half(m, exp - 1, 0, r != 0); +} + +/// Division and postprocessing for tangents. +/// \tparam R rounding mode to use +/// \param my dividend as Q1.31 +/// \param mx divisor as Q1.31 +/// \param exp biased exponent of result +/// \param sign sign bit of result +/// \return quotient converted to half-precision +/// \exception FE_OVERFLOW on overflows +/// \exception FE_UNDERFLOW on underflows +/// \exception FE_INEXACT if no other exception occurred +template +unsigned int tangent_post(uint32 my, uint32 mx, int exp, unsigned int sign = 0) +{ + int i = my >= mx, s; + exp += i; + if(exp > 29) + return overflow(sign); + if(exp < -11) + return underflow(sign); + uint32 m = divide64(my >> (i + 1), mx, s); + return fixed2half(m, exp, sign, s); +} + +/// Area function and postprocessing. +/// This computes the value directly in Q2.30 using the representation `asinh|acosh(x) = +/// log(x+sqrt(x^2+|-1))`. +/// \tparam R rounding mode to use +/// \tparam S `true` for asinh, `false` for acosh +/// \param arg half-precision argument +/// \return asinh|acosh(\a arg) converted to half-precision +/// \exception FE_OVERFLOW on overflows +/// \exception FE_UNDERFLOW on underflows +/// \exception FE_INEXACT if no other exception occurred +template +unsigned int area(unsigned int arg) +{ + int abs = arg & 0x7FFF, expx = (abs >> 10) + (abs <= 0x3FF) - 15, expy = -15, ilog, i; + uint32 mx = static_cast((abs & 0x3FF) | ((abs > 0x3FF) << 10)) << 20, my, r; + for(; abs < 0x400; abs <<= 1, --expy) + ; + expy += abs >> 10; + r = ((abs & 0x3FF) | 0x400) << 5; + r *= r; + i = r >> 31; + expy = 2 * expy + i; + r >>= i; + if(S) + { + if(expy < 0) + { + r = 0x40000000 + ((expy > -30) ? ((r >> -expy) | + ((r & ((static_cast(1) << -expy) - 1)) != 0)) + : 1); + expy = 0; + } + else + { + r += 0x40000000 >> expy; + i = r >> 31; + r = (r >> i) | (r & i); + expy += i; + } + } + else + { + r -= 0x40000000 >> expy; + for(; r < 0x40000000; r <<= 1, --expy) + ; + } + my = sqrt<30>(r, expy); + my = (my << 15) + (r << 14) / my; + if(S) + { + mx >>= expy - expx; + ilog = expy; + } + else + { + my >>= expx - expy; + ilog = expx; + } + my += mx; + i = my >> 31; + static const int G = S && (R == std::round_to_nearest); + return log2_post( + log2(my >> i, 26 + S + G) + (G << 3), ilog + i, 17, arg & (static_cast(S) << 15)); +} + +/// Class for 1.31 unsigned floating-point computation +struct f31 +{ + /// Constructor. + /// \param mant mantissa as 1.31 + /// \param e exponent + HALF_CONSTEXPR f31(uint32 mant, int e) : m(mant), exp(e) {} + + /// Constructor. + /// \param abs unsigned half-precision value + f31(unsigned int abs) : exp(-15) + { + for(; abs < 0x400; abs <<= 1, --exp) + ; + m = static_cast((abs & 0x3FF) | 0x400) << 21; + exp += (abs >> 10); + } + + /// Addition operator. + /// \param a first operand + /// \param b second operand + /// \return \a a + \a b + friend f31 operator+(f31 a, f31 b) + { + if(b.exp > a.exp) + std::swap(a, b); + int d = a.exp - b.exp; + uint32 m = a.m + ((d < 32) ? 
(b.m >> d) : 0); + int i = (m & 0xFFFFFFFF) < a.m; + return f31(((m + i) >> i) | 0x80000000, a.exp + i); + } + + /// Subtraction operator. + /// \param a first operand + /// \param b second operand + /// \return \a a - \a b + friend f31 operator-(f31 a, f31 b) + { + int d = a.exp - b.exp, exp = a.exp; + uint32 m = a.m - ((d < 32) ? (b.m >> d) : 0); + if(!m) + return f31(0, -32); + for(; m < 0x80000000; m <<= 1, --exp) + ; + return f31(m, exp); + } + + /// Multiplication operator. + /// \param a first operand + /// \param b second operand + /// \return \a a * \a b + friend f31 operator*(f31 a, f31 b) + { + uint32 m = multiply64(a.m, b.m); + int i = m >> 31; + return f31(m << (1 - i), a.exp + b.exp + i); + } + + /// Division operator. + /// \param a first operand + /// \param b second operand + /// \return \a a / \a b + friend f31 operator/(f31 a, f31 b) + { + int i = a.m >= b.m, s; + uint32 m = divide64((a.m + i) >> i, b.m, s); + return f31(m, a.exp - b.exp + i - 1); + } + + uint32 m; ///< mantissa as 1.31. + int exp; ///< exponent. +}; + +/// Error function and postprocessing. +/// This computes the value directly in Q1.31 using the approximations given +/// [here](https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions). +/// \tparam R rounding mode to use +/// \tparam C `true` for comlementary error function, `false` else +/// \param arg half-precision function argument +/// \return approximated value of error function in half-precision +/// \exception FE_OVERFLOW on overflows +/// \exception FE_UNDERFLOW on underflows +/// \exception FE_INEXACT if no other exception occurred +template +unsigned int erf(unsigned int arg) +{ + unsigned int abs = arg & 0x7FFF, sign = arg & 0x8000; + f31 x(abs), x2 = x * x * f31(0xB8AA3B29, 0), + t = f31(0x80000000, 0) / (f31(0x80000000, 0) + f31(0xA7BA054A, -2) * x), t2 = t * t; + f31 e = ((f31(0x87DC2213, 0) * t2 + f31(0xB5F0E2AE, 0)) * t2 + f31(0x82790637, -2) - + (f31(0xBA00E2B8, 0) * t2 + f31(0x91A98E62, -2)) * t) * + t / + ((x2.exp < 0) ? f31(exp2((x2.exp > -32) ? (x2.m >> -x2.exp) : 0, 30), 0) + : f31(exp2((x2.m << x2.exp) & 0x7FFFFFFF, 22), x2.m >> (31 - x2.exp))); + return (!C || sign) + ? fixed2half( + 0x80000000 - (e.m >> (C - e.exp)), 14 + C, sign & (C - 1U)) + : (e.exp < -25) + ? underflow() + : fixed2half(e.m >> 1, e.exp + 14, 0, e.m & 1); +} + +/// Gamma function and postprocessing. +/// This approximates the value of either the gamma function or its logarithm directly in Q1.31. +/// \tparam R rounding mode to use +/// \tparam L `true` for lograithm of gamma function, `false` for gamma function +/// \param arg half-precision floating-point value +/// \return lgamma/tgamma(\a arg) in half-precision +/// \exception FE_OVERFLOW on overflows +/// \exception FE_UNDERFLOW on underflows +/// \exception FE_INEXACT if \a arg is not a positive integer +template +unsigned int gamma(unsigned int arg) +{ + /* static const double p[] ={ 2.50662827563479526904, 225.525584619175212544, + -268.295973841304927459, 80.9030806934622512966, -5.00757863970517583837, + 0.0114684895434781459556 }; double t = arg + 4.65, s = p[0]; for(unsigned int i=0; i<5; ++i) + s += p[i+1] / (arg+i); + return std::log(s) + (arg-0.5)*std::log(t) - t; +*/ static const f31 pi(0xC90FDAA2, 1), + lbe(0xB8AA3B29, 0); + unsigned int abs = arg & 0x7FFF, sign = arg & 0x8000; + bool bsign = sign != 0; + f31 z(abs), x = sign ? 
(z + f31(0x80000000, 0)) : z, t = x + f31(0x94CCCCCD, 2), + s = f31(0xA06C9901, 1) + f31(0xBBE654E2, -7) / (x + f31(0x80000000, 2)) + + f31(0xA1CE6098, 6) / (x + f31(0x80000000, 1)) + f31(0xE1868CB7, 7) / x - + f31(0x8625E279, 8) / (x + f31(0x80000000, 0)) - + f31(0xA03E158F, 2) / (x + f31(0xC0000000, 1)); + int i = (s.exp >= 2) + (s.exp >= 4) + (s.exp >= 8) + (s.exp >= 16); + s = f31((static_cast(s.exp) << (31 - i)) + (log2(s.m >> 1, 28) >> i), i) / lbe; + if(x.exp != -1 || x.m != 0x80000000) + { + i = (t.exp >= 2) + (t.exp >= 4) + (t.exp >= 8); + f31 l = f31((static_cast(t.exp) << (31 - i)) + (log2(t.m >> 1, 30) >> i), i) / lbe; + s = (x.exp < -1) ? (s - (f31(0x80000000, -1) - x) * l) + : (s + (x - f31(0x80000000, -1)) * l); + } + s = x.exp ? (s - t) : (t - s); + if(bsign) + { + if(z.exp >= 0) + { + sign &= (L | ((z.m >> (31 - z.exp)) & 1)) - 1; + for(z = f31((z.m << (1 + z.exp)) & 0xFFFFFFFF, -1); z.m < 0x80000000; + z.m <<= 1, --z.exp) + ; + } + if(z.exp == -1) + z = f31(0x80000000, 0) - z; + if(z.exp < -1) + { + z = z * pi; + z.m = sincos(z.m >> (1 - z.exp), 30).first; + for(z.exp = 1; z.m < 0x80000000; z.m <<= 1, --z.exp) + ; + } + else + z = f31(0x80000000, 0); + } + if(L) + { + if(bsign) + { + f31 l(0x92868247, 0); + if(z.exp < 0) + { + uint32 m = log2((z.m + 1) >> 1, 27); + z = f31(-((static_cast(z.exp) << 26) + (m >> 5)), 5); + for(; z.m < 0x80000000; z.m <<= 1, --z.exp) + ; + l = l + z / lbe; + } + sign = static_cast(x.exp && (l.exp < s.exp || (l.exp == s.exp && l.m < s.m))) + << 15; + s = sign ? (s - l) : x.exp ? (l - s) : (l + s); + } + else + { + sign = static_cast(x.exp == 0) << 15; + if(s.exp < -24) + return underflow(sign); + if(s.exp > 15) + return overflow(sign); + } + } + else + { + s = s * lbe; + uint32 m; + if(s.exp < 0) + { + m = s.m >> -s.exp; + s.exp = 0; + } + else + { + m = (s.m << s.exp) & 0x7FFFFFFF; + s.exp = (s.m >> (31 - s.exp)); + } + s.m = exp2(m, 27); + if(!x.exp) + s = f31(0x80000000, 0) / s; + if(bsign) + { + if(z.exp < 0) + s = s * z; + s = pi / s; + if(s.exp < -24) + return underflow(sign); + } + else if(z.exp > 0 && !(z.m & ((1 << (31 - z.exp)) - 1))) + return ((s.exp + 14) << 10) + (s.m >> 21); + if(s.exp > 15) + return overflow(sign); + } + return fixed2half(s.m, s.exp + 14, sign); +} +/// \} + +template +struct half_caster; +} // namespace detail + +/// Half-precision floating-point type. +/// This class implements an IEEE-conformant half-precision floating-point type with the usual +/// arithmetic +/// operators and conversions. It is implicitly convertible to single-precision floating-point, +/// which makes artihmetic +/// expressions and functions with mixed-type operands to be of the most precise operand type. +/// +/// According to the C++98/03 definition, the half type is not a POD type. But according to C++11's +/// less strict and +/// extended definitions it is both a standard layout type and a trivially copyable type (even if +/// not a POD type), which +/// means it can be standard-conformantly copied using raw binary copies. But in this context some +/// more words about the +/// actual size of the type. Although the half is representing an IEEE 16-bit type, it does not +/// neccessarily have to be of +/// exactly 16-bits size. But on any reasonable implementation the actual binary representation of +/// this type will most +/// probably not ivolve any additional "magic" or padding beyond the simple binary representation of +/// the underlying 16-bit +/// IEEE number, even if not strictly guaranteed by the standard. 
But even then it only has an +/// actual size of 16 bits if +/// your C++ implementation supports an unsigned integer type of exactly 16 bits width. But this +/// should be the case on +/// nearly any reasonable platform. +/// +/// So if your C++ implementation is not totally exotic or imposes special alignment requirements, +/// it is a reasonable +/// assumption that the data of a half is just comprised of the 2 bytes of the underlying IEEE +/// representation. +class half +{ + public: + /// \name Construction and assignment + /// \{ + + /// Default constructor. + /// This initializes the half to 0. Although this does not match the builtin types' + /// default-initialization semantics + /// and may be less efficient than no initialization, it is needed to provide proper + /// value-initialization semantics. + HALF_CONSTEXPR half() HALF_NOEXCEPT : data_() {} + + /// Conversion constructor. + /// \param rhs float to convert + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + explicit half(float rhs) + : data_(static_cast(detail::float2half(rhs))) + { + } + + /// Conversion to single-precision. + /// \return single precision value representing expression value + operator float() const { return detail::half2float(data_); } + + /// Assignment operator. + /// \param rhs single-precision value to copy from + /// \return reference to this half + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + half& operator=(float rhs) + { + data_ = static_cast(detail::float2half(rhs)); + return *this; + } + + /// \} + /// \name Arithmetic updates + /// \{ + + /// Arithmetic assignment. + /// \tparam T type of concrete half expression + /// \param rhs half expression to add + /// \return reference to this half + /// \exception FE_... according to operator+(half,half) + half& operator+=(half rhs) { return *this = *this + rhs; } + + /// Arithmetic assignment. + /// \tparam T type of concrete half expression + /// \param rhs half expression to subtract + /// \return reference to this half + /// \exception FE_... according to operator-(half,half) + half& operator-=(half rhs) { return *this = *this - rhs; } + + /// Arithmetic assignment. + /// \tparam T type of concrete half expression + /// \param rhs half expression to multiply with + /// \return reference to this half + /// \exception FE_... according to operator*(half,half) + half& operator*=(half rhs) { return *this = *this * rhs; } + + /// Arithmetic assignment. + /// \tparam T type of concrete half expression + /// \param rhs half expression to divide by + /// \return reference to this half + /// \exception FE_... according to operator/(half,half) + half& operator/=(half rhs) { return *this = *this / rhs; } + + /// Arithmetic assignment. + /// \param rhs single-precision value to add + /// \return reference to this half + /// \exception FE_... according to operator=() + half& operator+=(float rhs) { return *this = *this + rhs; } + + /// Arithmetic assignment. + /// \param rhs single-precision value to subtract + /// \return reference to this half + /// \exception FE_... according to operator=() + half& operator-=(float rhs) { return *this = *this - rhs; } + + /// Arithmetic assignment. + /// \param rhs single-precision value to multiply with + /// \return reference to this half + /// \exception FE_... according to operator=() + half& operator*=(float rhs) { return *this = *this * rhs; } + + /// Arithmetic assignment. 
+ /// \param rhs single-precision value to divide by + /// \return reference to this half + /// \exception FE_... according to operator=() + half& operator/=(float rhs) { return *this = *this / rhs; } + + /// \} + /// \name Increment and decrement + /// \{ + + /// Prefix increment. + /// \return incremented half value + /// \exception FE_... according to operator+(half,half) + half& operator++() { return *this = *this + half(detail::binary, 0x3C00); } + + /// Prefix decrement. + /// \return decremented half value + /// \exception FE_... according to operator-(half,half) + half& operator--() { return *this = *this + half(detail::binary, 0xBC00); } + + /// Postfix increment. + /// \return non-incremented half value + /// \exception FE_... according to operator+(half,half) + half operator++(int) + { + half out(*this); + ++*this; + return out; + } + + /// Postfix decrement. + /// \return non-decremented half value + /// \exception FE_... according to operator-(half,half) + half operator--(int) + { + half out(*this); + --*this; + return out; + } + /// \} + + private: + /// Rounding mode to use + static const std::float_round_style round_style = (std::float_round_style)(HALF_ROUND_STYLE); + + /// Constructor. + /// \param bits binary representation to set half to + HALF_CONSTEXPR half(detail::binary_t, unsigned int bits) HALF_NOEXCEPT + : data_(static_cast(bits)) + { + } + + /// Internal binary representation + detail::uint16 data_; + +#ifndef HALF_DOXYGEN_ONLY + friend HALF_CONSTEXPR_NOERR bool operator==(half, half); + friend HALF_CONSTEXPR_NOERR bool operator!=(half, half); + friend HALF_CONSTEXPR_NOERR bool operator<(half, half); + friend HALF_CONSTEXPR_NOERR bool operator>(half, half); + friend HALF_CONSTEXPR_NOERR bool operator<=(half, half); + friend HALF_CONSTEXPR_NOERR bool operator>=(half, half); + friend HALF_CONSTEXPR half operator-(half); + friend half operator+(half, half); + friend half operator-(half, half); + friend half operator*(half, half); + friend half operator/(half, half); + template + friend std::basic_ostream& operator<<(std::basic_ostream&, half); + template + friend std::basic_istream& operator>>(std::basic_istream&, half&); + friend HALF_CONSTEXPR half fabs(half); + friend half fmod(half, half); + friend half remainder(half, half); + friend half remquo(half, half, int*); + friend half fma(half, half, half); + friend HALF_CONSTEXPR_NOERR half fmax(half, half); + friend HALF_CONSTEXPR_NOERR half fmin(half, half); + friend half fdim(half, half); + friend half nanh(const char*); + friend half exp(half); + friend half exp2(half); + friend half expm1(half); + friend half log(half); + friend half log10(half); + friend half log2(half); + friend half log1p(half); + friend half sqrt(half); + friend half cbrt(half); + friend half hypot(half, half); + friend half hypot(half, half, half); + friend half pow(half, half); + friend void sincos(half, half*, half*); + friend half sin(half); + friend half cos(half); + friend half tan(half); + friend half asin(half); + friend half acos(half); + friend half atan(half); + friend half atan2(half, half); + friend half sinh(half); + friend half cosh(half); + friend half tanh(half); + friend half asinh(half); + friend half acosh(half); + friend half atanh(half); + friend half erf(half); + friend half erfc(half); + friend half lgamma(half); + friend half tgamma(half); + friend half ceil(half); + friend half floor(half); + friend half trunc(half); + friend half round(half); + friend long lround(half); + friend half rint(half); + friend long 
lrint(half); + friend half nearbyint(half); +#ifdef HALF_ENABLE_CPP11_LONG_LONG + friend long long llround(half); + friend long long llrint(half); +#endif + friend half frexp(half, int*); + friend half scalbln(half, long); + friend half modf(half, half*); + friend int ilogb(half); + friend half logb(half); + friend half nextafter(half, half); + friend half nexttoward(half, long double); + friend HALF_CONSTEXPR half copysign(half, half); + friend HALF_CONSTEXPR int fpclassify(half); + friend HALF_CONSTEXPR bool isfinite(half); + friend HALF_CONSTEXPR bool isinf(half); + friend HALF_CONSTEXPR bool isnan(half); + friend HALF_CONSTEXPR bool isnormal(half); + friend HALF_CONSTEXPR bool signbit(half); + friend HALF_CONSTEXPR bool isgreater(half, half); + friend HALF_CONSTEXPR bool isgreaterequal(half, half); + friend HALF_CONSTEXPR bool isless(half, half); + friend HALF_CONSTEXPR bool islessequal(half, half); + friend HALF_CONSTEXPR bool islessgreater(half, half); + template + friend struct detail::half_caster; + friend class std::numeric_limits; +#if HALF_ENABLE_CPP11_HASH + friend struct std::hash; +#endif +#if HALF_ENABLE_CPP11_USER_LITERALS + friend half literal::operator"" _h(long double); +#endif +#endif +}; + +#if HALF_ENABLE_CPP11_USER_LITERALS +namespace literal { +/// Half literal. +/// While this returns a properly rounded half-precision value, half literals can unfortunately not +/// be constant +/// expressions due to rather involved conversions. So don't expect this to be a literal literal +/// without involving +/// conversion operations at runtime. It is a convenience feature, not a performance optimization. +/// \param value literal value +/// \return half with of given value (possibly rounded) +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half operator"" _h(long double value) +{ + return half(detail::binary, detail::float2half(value)); +} +} // namespace literal +#endif + +namespace detail { +/// Helper class for half casts. +/// This class template has to be specialized for all valid cast arguments to define an appropriate +/// static +/// `cast` member function and a corresponding `type` member denoting its return type. +/// \tparam T destination type +/// \tparam U source type +/// \tparam R rounding mode to use +template +struct half_caster +{ +}; +template +struct half_caster +{ +#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS + static_assert(std::is_arithmetic::value, "half_cast from non-arithmetic type unsupported"); +#endif + + static half cast(U arg) { return cast_impl(arg, is_float()); }; + + private: + static half cast_impl(U arg, true_type) { return half(binary, float2half(arg)); } + static half cast_impl(U arg, false_type) { return half(binary, int2half(arg)); } +}; +template +struct half_caster +{ +#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS + static_assert(std::is_arithmetic::value, "half_cast to non-arithmetic type unsupported"); +#endif + + static T cast(half arg) { return cast_impl(arg, is_float()); } + + private: + static T cast_impl(half arg, true_type) { return half2float(arg.data_); } + static T cast_impl(half arg, false_type) { return half2int(arg.data_); } +}; +template +struct half_caster +{ + static half cast(half arg) { return arg; } +}; +} // namespace detail +} // namespace half_float + +/// Extensions to the C++ standard library. +namespace std { +/// Numeric limits for half-precision floats. 
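+// Minimal usage sketch of the public interface declared above (assumes a C++11 compiler
+// so that the _h literal and std::hash support are enabled; half_cast<> is the cast
+// front end served by detail::half_caster):
+//
+//   using half_float::half;
+//   using namespace half_float::literal;
+//   half a(3.5f);                              // rounded conversion from float
+//   half b = 1.5_h;                            // user-defined literal (converted at runtime)
+//   half c = a * b + b;                        // arithmetic through the friend operators
+//   float f = c;                               // implicit conversion back to single precision
+//   int i = half_float::half_cast<int>(c);     // rounding cast through half_caster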
+/// **See also:** Documentation for +/// [std::numeric_limits](https://en.cppreference.com/w/cpp/types/numeric_limits) +template <> +class numeric_limits +{ + public: + /// Is template specialization. + static HALF_CONSTEXPR_CONST bool is_specialized = true; + + /// Supports signed values. + static HALF_CONSTEXPR_CONST bool is_signed = true; + + /// Is not an integer type. + static HALF_CONSTEXPR_CONST bool is_integer = false; + + /// Is not exact. + static HALF_CONSTEXPR_CONST bool is_exact = false; + + /// Doesn't provide modulo arithmetic. + static HALF_CONSTEXPR_CONST bool is_modulo = false; + + /// Has a finite set of values. + static HALF_CONSTEXPR_CONST bool is_bounded = true; + + /// IEEE conformant. + static HALF_CONSTEXPR_CONST bool is_iec559 = true; + + /// Supports infinity. + static HALF_CONSTEXPR_CONST bool has_infinity = true; + + /// Supports quiet NaNs. + static HALF_CONSTEXPR_CONST bool has_quiet_NaN = true; + + /// Supports signaling NaNs. + static HALF_CONSTEXPR_CONST bool has_signaling_NaN = true; + + /// Supports subnormal values. + static HALF_CONSTEXPR_CONST float_denorm_style has_denorm = denorm_present; + + /// Supports no denormalization detection. + static HALF_CONSTEXPR_CONST bool has_denorm_loss = false; + +#if HALF_ERRHANDLING_THROWS + static HALF_CONSTEXPR_CONST bool traps = true; +#else + /// Traps only if [HALF_ERRHANDLING_THROW_...](\ref HALF_ERRHANDLING_THROW_INVALID) is + /// acitvated. + static HALF_CONSTEXPR_CONST bool traps = false; +#endif + + /// Does not support no pre-rounding underflow detection. + static HALF_CONSTEXPR_CONST bool tinyness_before = false; + + /// Rounding mode. + static HALF_CONSTEXPR_CONST float_round_style round_style = half_float::half::round_style; + + /// Significant digits. + static HALF_CONSTEXPR_CONST int digits = 11; + + /// Significant decimal digits. + static HALF_CONSTEXPR_CONST int digits10 = 3; + + /// Required decimal digits to represent all possible values. + static HALF_CONSTEXPR_CONST int max_digits10 = 5; + + /// Number base. + static HALF_CONSTEXPR_CONST int radix = 2; + + /// One more than smallest exponent. + static HALF_CONSTEXPR_CONST int min_exponent = -13; + + /// Smallest normalized representable power of 10. + static HALF_CONSTEXPR_CONST int min_exponent10 = -4; + + /// One more than largest exponent + static HALF_CONSTEXPR_CONST int max_exponent = 16; + + /// Largest finitely representable power of 10. + static HALF_CONSTEXPR_CONST int max_exponent10 = 4; + + /// Smallest positive normal value. + static HALF_CONSTEXPR half_float::half min() HALF_NOTHROW + { + return half_float::half(half_float::detail::binary, 0x0400); + } + + /// Smallest finite value. + static HALF_CONSTEXPR half_float::half lowest() HALF_NOTHROW + { + return half_float::half(half_float::detail::binary, 0xFBFF); + } + + /// Largest finite value. + static HALF_CONSTEXPR half_float::half max() HALF_NOTHROW + { + return half_float::half(half_float::detail::binary, 0x7BFF); + } + + /// Difference between 1 and next representable value. + static HALF_CONSTEXPR half_float::half epsilon() HALF_NOTHROW + { + return half_float::half(half_float::detail::binary, 0x1400); + } + + /// Maximum rounding error in ULP (units in the last place). + static HALF_CONSTEXPR half_float::half round_error() HALF_NOTHROW + { + return half_float::half(half_float::detail::binary, + (round_style == std::round_to_nearest) ? 0x3800 : 0x3C00); + } + + /// Positive infinity. 
+ static HALF_CONSTEXPR half_float::half infinity() HALF_NOTHROW + { + return half_float::half(half_float::detail::binary, 0x7C00); + } + + /// Quiet NaN. + static HALF_CONSTEXPR half_float::half quiet_NaN() HALF_NOTHROW + { + return half_float::half(half_float::detail::binary, 0x7FFF); + } + + /// Signaling NaN. + static HALF_CONSTEXPR half_float::half signaling_NaN() HALF_NOTHROW + { + return half_float::half(half_float::detail::binary, 0x7DFF); + } + + /// Smallest positive subnormal value. + static HALF_CONSTEXPR half_float::half denorm_min() HALF_NOTHROW + { + return half_float::half(half_float::detail::binary, 0x0001); + } +}; + +#if HALF_ENABLE_CPP11_HASH +/// Hash function for half-precision floats. +/// This is only defined if C++11 `std::hash` is supported and enabled. +/// +/// **See also:** Documentation for [std::hash](https://en.cppreference.com/w/cpp/utility/hash) +template <> +struct hash +{ + /// Type of function argument. + typedef half_float::half argument_type; + + /// Function return type. + typedef size_t result_type; + + /// Compute hash function. + /// \param arg half to hash + /// \return hash value + result_type operator()(argument_type arg) const + { + return hash()(arg.data_ & + -static_cast(arg.data_ != 0x8000)); + } +}; +#endif +} // namespace std + +namespace half_float { +/// \anchor compop +/// \name Comparison operators +/// \{ + +/// Comparison for equality. +/// \param x first operand +/// \param y second operand +/// \retval true if operands equal +/// \retval false else +/// \exception FE_INVALID if \a x or \a y is NaN +inline HALF_CONSTEXPR_NOERR bool operator==(half x, half y) +{ + return !detail::compsignal(x.data_, y.data_) && + (x.data_ == y.data_ || !((x.data_ | y.data_) & 0x7FFF)); +} + +/// Comparison for inequality. +/// \param x first operand +/// \param y second operand +/// \retval true if operands not equal +/// \retval false else +/// \exception FE_INVALID if \a x or \a y is NaN +inline HALF_CONSTEXPR_NOERR bool operator!=(half x, half y) +{ + return detail::compsignal(x.data_, y.data_) || + (x.data_ != y.data_ && ((x.data_ | y.data_) & 0x7FFF)); +} + +/// Comparison for less than. +/// \param x first operand +/// \param y second operand +/// \retval true if \a x less than \a y +/// \retval false else +/// \exception FE_INVALID if \a x or \a y is NaN +inline HALF_CONSTEXPR_NOERR bool operator<(half x, half y) +{ + return !detail::compsignal(x.data_, y.data_) && + ((x.data_ ^ (0x8000 | (0x8000 - (x.data_ >> 15)))) + (x.data_ >> 15)) < + ((y.data_ ^ (0x8000 | (0x8000 - (y.data_ >> 15)))) + (y.data_ >> 15)); +} + +/// Comparison for greater than. +/// \param x first operand +/// \param y second operand +/// \retval true if \a x greater than \a y +/// \retval false else +/// \exception FE_INVALID if \a x or \a y is NaN +inline HALF_CONSTEXPR_NOERR bool operator>(half x, half y) +{ + return !detail::compsignal(x.data_, y.data_) && + ((x.data_ ^ (0x8000 | (0x8000 - (x.data_ >> 15)))) + (x.data_ >> 15)) > + ((y.data_ ^ (0x8000 | (0x8000 - (y.data_ >> 15)))) + (y.data_ >> 15)); +} + +/// Comparison for less equal. 
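A short sketch of the `std::numeric_limits<half_float::half>` specialization and the NaN-aware comparisons defined above; the printed constants follow directly from the values in the specialization:

    #include "half.hpp"
    #include <limits>
    #include <iostream>

    int main()
    {
        using half_float::half;
        typedef std::numeric_limits<half> lim;

        std::cout << lim::digits    << '\n'    // 11 significand bits
                  << lim::max()     << '\n'    // 0x7BFF == 65504
                  << lim::epsilon() << '\n';   // 0x1400 == 2^-10

        half nan = lim::quiet_NaN();
        // operator== / operator!= above return the IEEE results for NaN operands
        // (and raise FE_INVALID if error handling is enabled).
        std::cout << (nan == nan) << ' ' << (nan != nan) << '\n';   // prints "0 1"
        return 0;
    }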
+/// \param x first operand +/// \param y second operand +/// \retval true if \a x less equal \a y +/// \retval false else +/// \exception FE_INVALID if \a x or \a y is NaN +inline HALF_CONSTEXPR_NOERR bool operator<=(half x, half y) +{ + return !detail::compsignal(x.data_, y.data_) && + ((x.data_ ^ (0x8000 | (0x8000 - (x.data_ >> 15)))) + (x.data_ >> 15)) <= + ((y.data_ ^ (0x8000 | (0x8000 - (y.data_ >> 15)))) + (y.data_ >> 15)); +} + +/// Comparison for greater equal. +/// \param x first operand +/// \param y second operand +/// \retval true if \a x greater equal \a y +/// \retval false else +/// \exception FE_INVALID if \a x or \a y is NaN +inline HALF_CONSTEXPR_NOERR bool operator>=(half x, half y) +{ + return !detail::compsignal(x.data_, y.data_) && + ((x.data_ ^ (0x8000 | (0x8000 - (x.data_ >> 15)))) + (x.data_ >> 15)) >= + ((y.data_ ^ (0x8000 | (0x8000 - (y.data_ >> 15)))) + (y.data_ >> 15)); +} + +/// \} +/// \anchor arithmetics +/// \name Arithmetic operators +/// \{ + +/// Identity. +/// \param arg operand +/// \return unchanged operand +inline HALF_CONSTEXPR half operator+(half arg) { return arg; } + +/// Negation. +/// \param arg operand +/// \return negated operand +inline HALF_CONSTEXPR half operator-(half arg) { return half(detail::binary, arg.data_ ^ 0x8000); } + +/// Addition. +/// This operation is exact to rounding for all rounding modes. +/// \param x left operand +/// \param y right operand +/// \return sum of half expressions +/// \exception FE_INVALID if \a x and \a y are infinities with different signs or signaling NaNs +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half operator+(half x, half y) +{ +#ifdef HALF_ARITHMETIC_TYPE + return half( + detail::binary, + detail::float2half(detail::half2float(x.data_) + + detail::half2float(y.data_))); +#else + int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF; + bool sub = ((x.data_ ^ y.data_) & 0x8000) != 0; + if(absx >= 0x7C00 || absy >= 0x7C00) + return half(detail::binary, + (absx > 0x7C00 || absy > 0x7C00) + ? detail::signal(x.data_, y.data_) + : (absy != 0x7C00) ? x.data_ + : (sub && absx == 0x7C00) ? detail::invalid() : y.data_); + if(!absx) + return absy ? y + : half(detail::binary, + (half::round_style == std::round_toward_neg_infinity) + ? (x.data_ | y.data_) + : (x.data_ & y.data_)); + if(!absy) + return x; + unsigned int sign = ((sub && absy > absx) ? y.data_ : x.data_) & 0x8000; + if(absy > absx) + std::swap(absx, absy); + int exp = (absx >> 10) + (absx <= 0x3FF), d = exp - (absy >> 10) - (absy <= 0x3FF), + mx = ((absx & 0x3FF) | ((absx > 0x3FF) << 10)) << 3, my; + if(d < 13) + { + my = ((absy & 0x3FF) | ((absy > 0x3FF) << 10)) << 3; + my = (my >> d) | ((my & ((1 << d) - 1)) != 0); + } + else + my = 1; + if(sub) + { + if(!(mx -= my)) + return half(detail::binary, + static_cast(half::round_style == std::round_toward_neg_infinity) + << 15); + for(; mx < 0x2000 && exp > 1; mx <<= 1, --exp) + ; + } + else + { + mx += my; + int i = mx >> 14; + if((exp += i) > 30) + return half(detail::binary, detail::overflow(sign)); + mx = (mx >> i) | (mx & i); + } + return half(detail::binary, + detail::rounded( + sign + ((exp - 1) << 10) + (mx >> 3), (mx >> 2) & 1, (mx & 0x3) != 0)); +#endif +} + +/// Subtraction. +/// This operation is exact to rounding for all rounding modes. 
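The addition above rounds its exact sum to the nearest representable half; with only 11 significand bits the spacing between values at 2048 is already 2, which the following sketch makes visible (assuming the default round-to-nearest mode and the `half(float)` constructor defined earlier in this header):

    #include "half.hpp"
    #include <iostream>

    int main()
    {
        using half_float::half;
        half big(2048.0f), one(1.0f);   // half(float) constructor assumed from earlier in the header
        half sum = big + one;           // exact sum 2049 ties back to 2048 under round-to-nearest-even
        std::cout << sum << ' ' << (sum == big) << '\n';   // prints "2048 1"
        return 0;
    }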
+/// \param x left operand +/// \param y right operand +/// \return difference of half expressions +/// \exception FE_INVALID if \a x and \a y are infinities with equal signs or signaling NaNs +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half operator-(half x, half y) +{ +#ifdef HALF_ARITHMETIC_TYPE + return half( + detail::binary, + detail::float2half(detail::half2float(x.data_) - + detail::half2float(y.data_))); +#else + return x + -y; +#endif +} + +/// Multiplication. +/// This operation is exact to rounding for all rounding modes. +/// \param x left operand +/// \param y right operand +/// \return product of half expressions +/// \exception FE_INVALID if multiplying 0 with infinity or if \a x or \a y is signaling NaN +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half operator*(half x, half y) +{ +#ifdef HALF_ARITHMETIC_TYPE + return half( + detail::binary, + detail::float2half(detail::half2float(x.data_) * + detail::half2float(y.data_))); +#else + int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, exp = -16; + unsigned int sign = (x.data_ ^ y.data_) & 0x8000; + if(absx >= 0x7C00 || absy >= 0x7C00) + return half(detail::binary, + (absx > 0x7C00 || absy > 0x7C00) + ? detail::signal(x.data_, y.data_) + : ((absx == 0x7C00 && !absy) || (absy == 0x7C00 && !absx)) + ? detail::invalid() + : (sign | 0x7C00)); + if(!absx || !absy) + return half(detail::binary, sign); + for(; absx < 0x400; absx <<= 1, --exp) + ; + for(; absy < 0x400; absy <<= 1, --exp) + ; + detail::uint32 m = static_cast((absx & 0x3FF) | 0x400) * + static_cast((absy & 0x3FF) | 0x400); + int i = m >> 21, s = m & i; + exp += (absx >> 10) + (absy >> 10) + i; + if(exp > 29) + return half(detail::binary, detail::overflow(sign)); + else if(exp < -11) + return half(detail::binary, detail::underflow(sign)); + return half( + detail::binary, + detail::fixed2half(m >> i, exp, sign, s)); +#endif +} + +/// Division. +/// This operation is exact to rounding for all rounding modes. +/// \param x left operand +/// \param y right operand +/// \return quotient of half expressions +/// \exception FE_INVALID if dividing 0s or infinities with each other or if \a x or \a y is +/// signaling NaN +/// \exception FE_DIVBYZERO if dividing finite value by 0 +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half operator/(half x, half y) +{ +#ifdef HALF_ARITHMETIC_TYPE + return half( + detail::binary, + detail::float2half(detail::half2float(x.data_) / + detail::half2float(y.data_))); +#else + int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, exp = 14; + unsigned int sign = (x.data_ ^ y.data_) & 0x8000; + if(absx >= 0x7C00 || absy >= 0x7C00) + return half(detail::binary, + (absx > 0x7C00 || absy > 0x7C00) + ? detail::signal(x.data_, y.data_) + : (absx == absy) ? detail::invalid() + : (sign | ((absx == 0x7C00) ? 0x7C00 : 0))); + if(!absx) + return half(detail::binary, absy ? 
sign : detail::invalid()); + if(!absy) + return half(detail::binary, detail::pole(sign)); + for(; absx < 0x400; absx <<= 1, --exp) + ; + for(; absy < 0x400; absy <<= 1, ++exp) + ; + detail::uint32 mx = (absx & 0x3FF) | 0x400, my = (absy & 0x3FF) | 0x400; + int i = mx < my; + exp += (absx >> 10) - (absy >> 10) - i; + if(exp > 29) + return half(detail::binary, detail::overflow(sign)); + else if(exp < -11) + return half(detail::binary, detail::underflow(sign)); + mx <<= 12 + i; + my <<= 1; + return half(detail::binary, + detail::fixed2half( + mx / my, exp, sign, mx % my != 0)); +#endif +} + +/// \} +/// \anchor streaming +/// \name Input and output +/// \{ + +/// Output operator. +/// This uses the built-in functionality for streaming out floating-point numbers. +/// \param out output stream to write into +/// \param arg half expression to write +/// \return reference to output stream +template +std::basic_ostream& operator<<(std::basic_ostream& out, half arg) +{ +#ifdef HALF_ARITHMETIC_TYPE + return out << detail::half2float(arg.data_); +#else + return out << detail::half2float(arg.data_); +#endif +} + +/// Input operator. +/// This uses the built-in functionality for streaming in floating-point numbers, specifically +/// double precision floating +/// point numbers (unless overridden with [HALF_ARITHMETIC_TYPE](\ref HALF_ARITHMETIC_TYPE)). So the +/// input string is first +/// rounded to double precision using the underlying platform's current floating-point rounding mode +/// before being rounded +/// to half-precision using the library's half-precision rounding mode. +/// \param in input stream to read from +/// \param arg half to read into +/// \return reference to input stream +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +template +std::basic_istream& operator>>(std::basic_istream& in, half& arg) +{ +#ifdef HALF_ARITHMETIC_TYPE + detail::internal_t f; +#else + double f; +#endif + if(in >> f) + arg.data_ = detail::float2half(f); + return in; +} + +/// \} +/// \anchor basic +/// \name Basic mathematical operations +/// \{ + +/// Absolute value. +/// **See also:** Documentation for +/// [std::fabs](https://en.cppreference.com/w/cpp/numeric/math/fabs). +/// \param arg operand +/// \return absolute value of \a arg +inline HALF_CONSTEXPR half fabs(half arg) { return half(detail::binary, arg.data_ & 0x7FFF); } + +/// Absolute value. +/// **See also:** Documentation for [std::abs](https://en.cppreference.com/w/cpp/numeric/math/fabs). +/// \param arg operand +/// \return absolute value of \a arg +inline HALF_CONSTEXPR half abs(half arg) { return fabs(arg); } + +/// Remainder of division. +/// **See also:** Documentation for +/// [std::fmod](https://en.cppreference.com/w/cpp/numeric/math/fmod). +/// \param x first operand +/// \param y second operand +/// \return remainder of floating-point division. +/// \exception FE_INVALID if \a x is infinite or \a y is 0 or if \a x or \a y is signaling NaN +inline half fmod(half x, half y) +{ + unsigned int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, sign = x.data_ & 0x8000; + if(absx >= 0x7C00 || absy >= 0x7C00) + return half(detail::binary, + (absx > 0x7C00 || absy > 0x7C00) + ? detail::signal(x.data_, y.data_) + : (absx == 0x7C00) ? detail::invalid() : x.data_); + if(!absy) + return half(detail::binary, detail::invalid()); + if(!absx) + return x; + if(absx == absy) + return half(detail::binary, sign); + return half(detail::binary, sign | detail::mod(absx, absy)); +} + +/// Remainder of division. 
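Since `operator>>` above parses into a double (or `HALF_ARITHMETIC_TYPE`) first and then rounds, textual round trips are only as precise as half precision allows, while `fmod` keeps exactly representable results exact. A small sketch:

    #include "half.hpp"
    #include <sstream>
    #include <iostream>

    int main()
    {
        using half_float::half;

        half x(0.0f);
        std::istringstream in("0.333333");
        in >> x;                                 // text -> double -> half rounding
        std::cout << x << '\n';                  // nearest half, about 0.33325

        half r = fmod(half(7.5f), half(2.0f));   // found via ADL in namespace half_float
        std::cout << r << '\n';                  // 1.5, exactly representable
        return 0;
    }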
+/// **See also:** Documentation for +/// [std::remainder](https://en.cppreference.com/w/cpp/numeric/math/remainder). +/// \param x first operand +/// \param y second operand +/// \return remainder of floating-point division. +/// \exception FE_INVALID if \a x is infinite or \a y is 0 or if \a x or \a y is signaling NaN +inline half remainder(half x, half y) +{ + unsigned int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, sign = x.data_ & 0x8000; + if(absx >= 0x7C00 || absy >= 0x7C00) + return half(detail::binary, + (absx > 0x7C00 || absy > 0x7C00) + ? detail::signal(x.data_, y.data_) + : (absx == 0x7C00) ? detail::invalid() : x.data_); + if(!absy) + return half(detail::binary, detail::invalid()); + if(absx == absy) + return half(detail::binary, sign); + return half(detail::binary, sign ^ detail::mod(absx, absy)); +} + +/// Remainder of division. +/// **See also:** Documentation for +/// [std::remquo](https://en.cppreference.com/w/cpp/numeric/math/remquo). +/// \param x first operand +/// \param y second operand +/// \param quo address to store some bits of quotient at +/// \return remainder of floating-point division. +/// \exception FE_INVALID if \a x is infinite or \a y is 0 or if \a x or \a y is signaling NaN +inline half remquo(half x, half y, int* quo) +{ + unsigned int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, value = x.data_ & 0x8000; + if(absx >= 0x7C00 || absy >= 0x7C00) + return half(detail::binary, + (absx > 0x7C00 || absy > 0x7C00) + ? detail::signal(x.data_, y.data_) + : (absx == 0x7C00) ? detail::invalid() : (*quo = 0, x.data_)); + if(!absy) + return half(detail::binary, detail::invalid()); + bool qsign = ((value ^ y.data_) & 0x8000) != 0; + int q = 1; + if(absx != absy) + value ^= detail::mod(absx, absy, &q); + return *quo = qsign ? -q : q, half(detail::binary, value); +} + +/// Fused multiply add. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for [std::fma](https://en.cppreference.com/w/cpp/numeric/math/fma). +/// \param x first operand +/// \param y second operand +/// \param z third operand +/// \return ( \a x * \a y ) + \a z rounded as one operation. +/// \exception FE_INVALID according to operator*() and operator+() unless any argument is a quiet +/// NaN and no argument is a signaling NaN +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding the final addition +inline half fma(half x, half y, half z) +{ +#ifdef HALF_ARITHMETIC_TYPE + detail::internal_t fx = detail::half2float(x.data_), + fy = detail::half2float(y.data_), + fz = detail::half2float(z.data_); +#if HALF_ENABLE_CPP11_CMATH && FP_FAST_FMA + return half(detail::binary, detail::float2half(std::fma(fx, fy, fz))); +#else + return half(detail::binary, detail::float2half(fx * fy + fz)); +#endif +#else + int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, absz = z.data_ & 0x7FFF, exp = -15; + unsigned int sign = (x.data_ ^ y.data_) & 0x8000; + bool sub = ((sign ^ z.data_) & 0x8000) != 0; + if(absx >= 0x7C00 || absy >= 0x7C00 || absz >= 0x7C00) + return (absx > 0x7C00 || absy > 0x7C00 || absz > 0x7C00) + ? half(detail::binary, detail::signal(x.data_, y.data_, z.data_)) + : (absx == 0x7C00) ? half(detail::binary, + (!absy || (sub && absz == 0x7C00)) ? detail::invalid() + : (sign | 0x7C00)) + : (absy == 0x7C00) ? half(detail::binary, + (!absx || (sub && absz == 0x7C00)) + ? detail::invalid() + : (sign | 0x7C00)) + : z; + if(!absx || !absy) + return absz + ? 
z + : half(detail::binary, + (half::round_style == std::round_toward_neg_infinity) ? (z.data_ | sign) + : (z.data_ & sign)); + for(; absx < 0x400; absx <<= 1, --exp) + ; + for(; absy < 0x400; absy <<= 1, --exp) + ; + detail::uint32 m = static_cast((absx & 0x3FF) | 0x400) * + static_cast((absy & 0x3FF) | 0x400); + int i = m >> 21; + exp += (absx >> 10) + (absy >> 10) + i; + m <<= 3 - i; + if(absz) + { + int expz = 0; + for(; absz < 0x400; absz <<= 1, --expz) + ; + expz += absz >> 10; + detail::uint32 mz = static_cast((absz & 0x3FF) | 0x400) << 13; + if(expz > exp || (expz == exp && mz > m)) + { + std::swap(m, mz); + std::swap(exp, expz); + if(sub) + sign = z.data_ & 0x8000; + } + int d = exp - expz; + mz = (d < 23) ? ((mz >> d) | ((mz & ((static_cast(1) << d) - 1)) != 0)) : 1; + if(sub) + { + m = m - mz; + if(!m) + return half( + detail::binary, + static_cast(half::round_style == std::round_toward_neg_infinity) + << 15); + for(; m < 0x800000; m <<= 1, --exp) + ; + } + else + { + m += mz; + i = m >> 24; + m = (m >> i) | (m & i); + exp += i; + } + } + if(exp > 30) + return half(detail::binary, detail::overflow(sign)); + else if(exp < -10) + return half(detail::binary, detail::underflow(sign)); + return half(detail::binary, + detail::fixed2half(m, exp - 1, sign)); +#endif +} + +/// Maximum of half expressions. +/// **See also:** Documentation for +/// [std::fmax](https://en.cppreference.com/w/cpp/numeric/math/fmax). +/// \param x first operand +/// \param y second operand +/// \return maximum of operands, ignoring quiet NaNs +/// \exception FE_INVALID if \a x or \a y is signaling NaN +inline HALF_CONSTEXPR_NOERR half fmax(half x, half y) +{ + return half(detail::binary, + (!isnan(y) && (isnan(x) || (x.data_ ^ (0x8000 | (0x8000 - (x.data_ >> 15)))) < + (y.data_ ^ (0x8000 | (0x8000 - (y.data_ >> 15)))))) + ? detail::select(y.data_, x.data_) + : detail::select(x.data_, y.data_)); +} + +/// Minimum of half expressions. +/// **See also:** Documentation for +/// [std::fmin](https://en.cppreference.com/w/cpp/numeric/math/fmin). +/// \param x first operand +/// \param y second operand +/// \return minimum of operands, ignoring quiet NaNs +/// \exception FE_INVALID if \a x or \a y is signaling NaN +inline HALF_CONSTEXPR_NOERR half fmin(half x, half y) +{ + return half(detail::binary, + (!isnan(y) && (isnan(x) || (x.data_ ^ (0x8000 | (0x8000 - (x.data_ >> 15)))) > + (y.data_ ^ (0x8000 | (0x8000 - (y.data_ >> 15)))))) + ? detail::select(y.data_, x.data_) + : detail::select(x.data_, y.data_)); +} + +/// Positive difference. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for +/// [std::fdim](https://en.cppreference.com/w/cpp/numeric/math/fdim). +/// \param x first operand +/// \param y second operand +/// \return \a x - \a y or 0 if difference negative +/// \exception FE_... according to operator-(half,half) +inline half fdim(half x, half y) +{ + if(isnan(x) || isnan(y)) + return half(detail::binary, detail::signal(x.data_, y.data_)); + return (x.data_ ^ (0x8000 | (0x8000 - (x.data_ >> 15)))) <= + (y.data_ ^ (0x8000 | (0x8000 - (y.data_ >> 15)))) + ? half(detail::binary, 0) + : (x - y); +} + +/// Get NaN value. +/// **See also:** Documentation for [std::nan](https://en.cppreference.com/w/cpp/numeric/math/nan). 
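Because `fma` above performs a single rounding, it can retain low-order bits that the separate multiply-then-subtract path discards; the concrete values in this sketch assume the default round-to-nearest mode:

    #include "half.hpp"
    #include <iostream>

    int main()
    {
        using half_float::half;
        half a(1.0029296875f);                  // 1 + 3*2^-10, exactly representable in half
        half two_step = a * a - half(1.0f);     // a*a is rounded to half before the subtraction
        half fused    = fma(a, a, half(-1.0f)); // single rounding of a*a - 1
        std::cout << two_step << ' ' << fused << ' '
                  << (two_step == fused) << '\n';   // last field: 0, the results differ
        return 0;
    }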
+/// \param arg string code +/// \return quiet NaN +inline half nanh(const char* arg) +{ + unsigned int value = 0x7FFF; + while(*arg) + value ^= static_cast(*arg++) & 0xFF; + return half(detail::binary, value); +} + +/// \} +/// \anchor exponential +/// \name Exponential functions +/// \{ + +/// Exponential function. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for [std::exp](https://en.cppreference.com/w/cpp/numeric/math/exp). +/// \param arg function argument +/// \return e raised to \a arg +/// \exception FE_INVALID for signaling NaN +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half exp(half arg) +{ +#ifdef HALF_ARITHMETIC_TYPE + return half(detail::binary, + detail::float2half( + std::exp(detail::half2float(arg.data_)))); +#else + int abs = arg.data_ & 0x7FFF; + if(!abs) + return half(detail::binary, 0x3C00); + if(abs >= 0x7C00) + return half(detail::binary, + (abs == 0x7C00) ? (0x7C00 & ((arg.data_ >> 15) - 1U)) + : detail::signal(arg.data_)); + if(abs >= 0x4C80) + return half(detail::binary, + (arg.data_ & 0x8000) ? detail::underflow() + : detail::overflow()); + detail::uint32 m = detail::multiply64( + static_cast((abs & 0x3FF) + ((abs > 0x3FF) << 10)) << 21, 0xB8AA3B29); + int e = (abs >> 10) + (abs <= 0x3FF), exp; + if(e < 14) + { + exp = 0; + m >>= 14 - e; + } + else + { + exp = m >> (45 - e); + m = (m << (e - 14)) & 0x7FFFFFFF; + } + return half(detail::binary, + detail::exp2_post( + detail::exp2(m, 26), exp, (arg.data_ & 0x8000) != 0)); +#endif +} + +/// Binary exponential. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for +/// [std::exp2](https://en.cppreference.com/w/cpp/numeric/math/exp2). +/// \param arg function argument +/// \return 2 raised to \a arg +/// \exception FE_INVALID for signaling NaN +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half exp2(half arg) +{ +#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH + return half(detail::binary, + detail::float2half( + std::exp2(detail::half2float(arg.data_)))); +#else + int abs = arg.data_ & 0x7FFF; + if(!abs) + return half(detail::binary, 0x3C00); + if(abs >= 0x7C00) + return half(detail::binary, + (abs == 0x7C00) ? (0x7C00 & ((arg.data_ >> 15) - 1U)) + : detail::signal(arg.data_)); + if(abs >= 0x4E40) + return half(detail::binary, + (arg.data_ & 0x8000) ? detail::underflow() + : detail::overflow()); + int e = (abs >> 10) + (abs <= 0x3FF), exp = (abs & 0x3FF) + ((abs > 0x3FF) << 10); + detail::uint32 m = detail::exp2((static_cast(exp) << (6 + e)) & 0x7FFFFFFF, 28); + exp >>= 25 - e; + if(m == 0x80000000) + { + if(arg.data_ & 0x8000) + exp = -exp; + else if(exp > 15) + return half(detail::binary, detail::overflow()); + return half(detail::binary, + detail::fixed2half(m, exp + 14)); + } + return half(detail::binary, + detail::exp2_post(m, exp, (arg.data_ & 0x8000) != 0)); +#endif +} + +/// Exponential minus one. +/// This function may be 1 ULP off the correctly rounded exact result in <0.05% of inputs for +/// `std::round_to_nearest` +/// and in <1% of inputs for any other rounding mode. +/// +/// **See also:** Documentation for +/// [std::expm1](https://en.cppreference.com/w/cpp/numeric/math/expm1). 
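The early-out thresholds in `exp`/`exp2` above reflect the narrow half range (largest finite value 65504); a quick sketch, assuming the default round-to-nearest mode in which overflow produces infinity:

    #include "half.hpp"
    #include <iostream>

    int main()
    {
        using half_float::half;
        std::cout << exp2(half(15.0f)) << '\n';          // 32768, still finite
        std::cout << isinf(exp2(half(16.0f))) << '\n';   // 1: 2^16 = 65536 > 65504 overflows
        return 0;
    }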
+/// \param arg function argument +/// \return e raised to \a arg and subtracted by 1 +/// \exception FE_INVALID for signaling NaN +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half expm1(half arg) +{ +#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH + return half(detail::binary, + detail::float2half( + std::expm1(detail::half2float(arg.data_)))); +#else + unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ & 0x8000; + if(!abs) + return arg; + if(abs >= 0x7C00) + return half(detail::binary, + (abs == 0x7C00) ? (0x7C00 + (sign >> 1)) : detail::signal(arg.data_)); + if(abs >= 0x4A00) + return half(detail::binary, + (arg.data_ & 0x8000) ? detail::rounded(0xBBFF, 1, 1) + : detail::overflow()); + detail::uint32 m = detail::multiply64( + static_cast((abs & 0x3FF) + ((abs > 0x3FF) << 10)) << 21, 0xB8AA3B29); + int e = (abs >> 10) + (abs <= 0x3FF), exp; + if(e < 14) + { + exp = 0; + m >>= 14 - e; + } + else + { + exp = m >> (45 - e); + m = (m << (e - 14)) & 0x7FFFFFFF; + } + m = detail::exp2(m); + if(sign) + { + int s = 0; + if(m > 0x80000000) + { + ++exp; + m = detail::divide64(0x80000000, m, s); + } + m = 0x80000000 - + ((m >> exp) | ((m & ((static_cast(1) << exp) - 1)) != 0) | s); + exp = 0; + } + else + m -= (exp < 31) ? (0x80000000 >> exp) : 1; + for(exp += 14; m < 0x80000000 && exp; m <<= 1, --exp) + ; + if(exp > 29) + return half(detail::binary, detail::overflow()); + return half(detail::binary, + detail::rounded( + sign + (exp << 10) + (m >> 21), (m >> 20) & 1, (m & 0xFFFFF) != 0)); +#endif +} + +/// Natural logarithm. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for [std::log](https://en.cppreference.com/w/cpp/numeric/math/log). +/// \param arg function argument +/// \return logarithm of \a arg to base e +/// \exception FE_INVALID for signaling NaN or negative argument +/// \exception FE_DIVBYZERO for 0 +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half log(half arg) +{ +#ifdef HALF_ARITHMETIC_TYPE + return half(detail::binary, + detail::float2half( + std::log(detail::half2float(arg.data_)))); +#else + int abs = arg.data_ & 0x7FFF, exp = -15; + if(!abs) + return half(detail::binary, detail::pole(0x8000)); + if(arg.data_ & 0x8000) + return half(detail::binary, + (arg.data_ <= 0xFC00) ? detail::invalid() : detail::signal(arg.data_)); + if(abs >= 0x7C00) + return (abs == 0x7C00) ? arg : half(detail::binary, detail::signal(arg.data_)); + for(; abs < 0x400; abs <<= 1, --exp) + ; + exp += abs >> 10; + return half(detail::binary, + detail::log2_post( + detail::log2(static_cast((abs & 0x3FF) | 0x400) << 20, 27) + 8, + exp, + 17)); +#endif +} + +/// Common logarithm. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for +/// [std::log10](https://en.cppreference.com/w/cpp/numeric/math/log10). +/// \param arg function argument +/// \return logarithm of \a arg to base 10 +/// \exception FE_INVALID for signaling NaN or negative argument +/// \exception FE_DIVBYZERO for 0 +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half log10(half arg) +{ +#ifdef HALF_ARITHMETIC_TYPE + return half(detail::binary, + detail::float2half( + std::log10(detail::half2float(arg.data_)))); +#else + int abs = arg.data_ & 0x7FFF, exp = -15; + if(!abs) + return half(detail::binary, detail::pole(0x8000)); + if(arg.data_ & 0x8000) + return half(detail::binary, + (arg.data_ <= 0xFC00) ? 
detail::invalid() : detail::signal(arg.data_)); + if(abs >= 0x7C00) + return (abs == 0x7C00) ? arg : half(detail::binary, detail::signal(arg.data_)); + switch(abs) + { + case 0x4900: return half(detail::binary, 0x3C00); + case 0x5640: return half(detail::binary, 0x4000); + case 0x63D0: return half(detail::binary, 0x4200); + case 0x70E2: return half(detail::binary, 0x4400); + } + for(; abs < 0x400; abs <<= 1, --exp) + ; + exp += abs >> 10; + return half(detail::binary, + detail::log2_post( + detail::log2(static_cast((abs & 0x3FF) | 0x400) << 20, 27) + 8, + exp, + 16)); +#endif +} + +/// Binary logarithm. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for +/// [std::log2](https://en.cppreference.com/w/cpp/numeric/math/log2). +/// \param arg function argument +/// \return logarithm of \a arg to base 2 +/// \exception FE_INVALID for signaling NaN or negative argument +/// \exception FE_DIVBYZERO for 0 +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half log2(half arg) +{ +#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH + return half(detail::binary, + detail::float2half( + std::log2(detail::half2float(arg.data_)))); +#else + int abs = arg.data_ & 0x7FFF, exp = -15, s = 0; + if(!abs) + return half(detail::binary, detail::pole(0x8000)); + if(arg.data_ & 0x8000) + return half(detail::binary, + (arg.data_ <= 0xFC00) ? detail::invalid() : detail::signal(arg.data_)); + if(abs >= 0x7C00) + return (abs == 0x7C00) ? arg : half(detail::binary, detail::signal(arg.data_)); + if(abs == 0x3C00) + return half(detail::binary, 0); + for(; abs < 0x400; abs <<= 1, --exp) + ; + exp += (abs >> 10); + if(!(abs & 0x3FF)) + { + unsigned int value = static_cast(exp < 0) << 15, m = std::abs(exp) << 6; + for(exp = 18; m < 0x400; m <<= 1, --exp) + ; + return half(detail::binary, value + (exp << 10) + m); + } + detail::uint32 ilog = exp, sign = detail::sign_mask(ilog), + m = (((ilog << 27) + + (detail::log2(static_cast((abs & 0x3FF) | 0x400) << 20, + 28) >> + 4)) ^ + sign) - + sign; + if(!m) + return half(detail::binary, 0); + for(exp = 14; m < 0x8000000 && exp; m <<= 1, --exp) + ; + for(; m > 0xFFFFFFF; m >>= 1, ++exp) + s |= m & 1; + return half( + detail::binary, + detail::fixed2half(m, exp, sign & 0x8000, s)); +#endif +} + +/// Natural logarithm plus one. +/// This function may be 1 ULP off the correctly rounded exact result in <0.05% of inputs for +/// `std::round_to_nearest` +/// and in ~1% of inputs for any other rounding mode. +/// +/// **See also:** Documentation for +/// [std::log1p](https://en.cppreference.com/w/cpp/numeric/math/log1p). +/// \param arg function argument +/// \return logarithm of \a arg plus 1 to base e +/// \exception FE_INVALID for signaling NaN or argument <-1 +/// \exception FE_DIVBYZERO for -1 +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half log1p(half arg) +{ +#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH + return half(detail::binary, + detail::float2half( + std::log1p(detail::half2float(arg.data_)))); +#else + if(arg.data_ >= 0xBC00) + return half(detail::binary, + (arg.data_ == 0xBC00) + ? detail::pole(0x8000) + : (arg.data_ <= 0xFC00) ? detail::invalid() : detail::signal(arg.data_)); + int abs = arg.data_ & 0x7FFF, exp = -15; + if(!abs || abs >= 0x7C00) + return (abs > 0x7C00) ? 
half(detail::binary, detail::signal(arg.data_)) : arg; + for(; abs < 0x400; abs <<= 1, --exp) + ; + exp += abs >> 10; + detail::uint32 m = static_cast((abs & 0x3FF) | 0x400) << 20; + if(arg.data_ & 0x8000) + { + m = 0x40000000 - (m >> -exp); + for(exp = 0; m < 0x40000000; m <<= 1, --exp) + ; + } + else + { + if(exp < 0) + { + m = 0x40000000 + (m >> -exp); + exp = 0; + } + else + { + m += 0x40000000 >> exp; + int i = m >> 31; + m >>= i; + exp += i; + } + } + return half(detail::binary, + detail::log2_post(detail::log2(m), exp, 17)); +#endif +} + +/// \} +/// \anchor power +/// \name Power functions +/// \{ + +/// Square root. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for +/// [std::sqrt](https://en.cppreference.com/w/cpp/numeric/math/sqrt). +/// \param arg function argument +/// \return square root of \a arg +/// \exception FE_INVALID for signaling NaN and negative arguments +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half sqrt(half arg) +{ +#ifdef HALF_ARITHMETIC_TYPE + return half(detail::binary, + detail::float2half( + std::sqrt(detail::half2float(arg.data_)))); +#else + int abs = arg.data_ & 0x7FFF, exp = 15; + if(!abs || arg.data_ >= 0x7C00) + return half(detail::binary, + (abs > 0x7C00) ? detail::signal(arg.data_) + : (arg.data_ > 0x8000) ? detail::invalid() : arg.data_); + for(; abs < 0x400; abs <<= 1, --exp) + ; + detail::uint32 r = static_cast((abs & 0x3FF) | 0x400) << 10, + m = detail::sqrt<20>(r, exp += abs >> 10); + return half( + detail::binary, + detail::rounded((exp << 10) + (m & 0x3FF), r > m, r != 0)); +#endif +} + +/// Cubic root. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for +/// [std::cbrt](https://en.cppreference.com/w/cpp/numeric/math/cbrt). +/// \param arg function argument +/// \return cubic root of \a arg +/// \exception FE_INVALID for signaling NaN +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half cbrt(half arg) +{ +#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH + return half(detail::binary, + detail::float2half( + std::cbrt(detail::half2float(arg.data_)))); +#else + int abs = arg.data_ & 0x7FFF, exp = -15; + if(!abs || abs == 0x3C00 || abs >= 0x7C00) + return (abs > 0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg; + for(; abs < 0x400; abs <<= 1, --exp) + ; + detail::uint32 ilog = exp + (abs >> 10), sign = detail::sign_mask(ilog), f, + m = (((ilog << 27) + + (detail::log2(static_cast((abs & 0x3FF) | 0x400) << 20, + 24) >> + 4)) ^ + sign) - + sign; + for(exp = 2; m < 0x80000000; m <<= 1, --exp) + ; + m = detail::multiply64(m, 0xAAAAAAAB); + int i = m >> 31, s; + exp += i; + m <<= 1 - i; + if(exp < 0) + { + f = m >> -exp; + exp = 0; + } + else + { + f = (m << exp) & 0x7FFFFFFF; + exp = m >> (31 - exp); + } + m = detail::exp2(f, (half::round_style == std::round_to_nearest) ? 29 : 26); + if(sign) + { + if(m > 0x80000000) + { + m = detail::divide64(0x80000000, m, s); + ++exp; + } + exp = -exp; + } + return half(detail::binary, + (half::round_style == std::round_to_nearest) + ? detail::fixed2half( + m, exp + 14, arg.data_ & 0x8000) + : detail::fixed2half( + (m + 0x80) >> 8, exp + 14, arg.data_ & 0x8000)); +#endif +} + +/// Hypotenuse function. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for +/// [std::hypot](https://en.cppreference.com/w/cpp/numeric/math/hypot). 
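`hypot` above rescales its operands internally, so it avoids the intermediate overflow that the naive formula hits almost immediately at half precision; a sketch under the default round-to-nearest mode:

    #include "half.hpp"
    #include <iostream>

    int main()
    {
        using half_float::half;
        half x(300.0f), y(400.0f);
        std::cout << sqrt(x * x + y * y) << '\n';   // x*x = 90000 overflows to inf, so this prints inf
        std::cout << hypot(x, y) << '\n';           // 500
        return 0;
    }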
+/// \param x first argument +/// \param y second argument +/// \return square root of sum of squares without internal over- or underflows +/// \exception FE_INVALID if \a x or \a y is signaling NaN +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding of the final square root +inline half hypot(half x, half y) +{ +#ifdef HALF_ARITHMETIC_TYPE + detail::internal_t fx = detail::half2float(x.data_), + fy = detail::half2float(y.data_); +#if HALF_ENABLE_CPP11_CMATH + return half(detail::binary, detail::float2half(std::hypot(fx, fy))); +#else + return half(detail::binary, + detail::float2half(std::sqrt(fx * fx + fy * fy))); +#endif +#else + int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, expx = 0, expy = 0; + if(absx >= 0x7C00 || absy >= 0x7C00) + return half(detail::binary, + (absx == 0x7C00) ? detail::select(0x7C00, y.data_) + : (absy == 0x7C00) ? detail::select(0x7C00, x.data_) + : detail::signal(x.data_, y.data_)); + if(!absx) + return half(detail::binary, absy ? detail::check_underflow(absy) : 0); + if(!absy) + return half(detail::binary, detail::check_underflow(absx)); + if(absy > absx) + std::swap(absx, absy); + for(; absx < 0x400; absx <<= 1, --expx) + ; + for(; absy < 0x400; absy <<= 1, --expy) + ; + detail::uint32 mx = (absx & 0x3FF) | 0x400, my = (absy & 0x3FF) | 0x400; + mx *= mx; + my *= my; + int ix = mx >> 21, iy = my >> 21; + expx = 2 * (expx + (absx >> 10)) - 15 + ix; + expy = 2 * (expy + (absy >> 10)) - 15 + iy; + mx <<= 10 - ix; + my <<= 10 - iy; + int d = expx - expy; + my = (d < 30) ? ((my >> d) | ((my & ((static_cast(1) << d) - 1)) != 0)) : 1; + return half(detail::binary, detail::hypot_post(mx + my, expx)); +#endif +} + +/// Hypotenuse function. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for +/// [std::hypot](https://en.cppreference.com/w/cpp/numeric/math/hypot). +/// \param x first argument +/// \param y second argument +/// \param z third argument +/// \return square root of sum of squares without internal over- or underflows +/// \exception FE_INVALID if \a x, \a y or \a z is signaling NaN +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding of the final square root +inline half hypot(half x, half y, half z) +{ +#ifdef HALF_ARITHMETIC_TYPE + detail::internal_t fx = detail::half2float(x.data_), + fy = detail::half2float(y.data_), + fz = detail::half2float(z.data_); + return half(detail::binary, + detail::float2half(std::sqrt(fx * fx + fy * fy + fz * fz))); +#else + int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, absz = z.data_ & 0x7FFF, expx = 0, + expy = 0, expz = 0; + if(!absx) + return hypot(y, z); + if(!absy) + return hypot(x, z); + if(!absz) + return hypot(x, y); + if(absx >= 0x7C00 || absy >= 0x7C00 || absz >= 0x7C00) + return half(detail::binary, + (absx == 0x7C00) + ? detail::select(0x7C00, detail::select(y.data_, z.data_)) + : (absy == 0x7C00) + ? detail::select(0x7C00, detail::select(x.data_, z.data_)) + : (absz == 0x7C00) + ? 
detail::select(0x7C00, detail::select(x.data_, y.data_)) + : detail::signal(x.data_, y.data_, z.data_)); + if(absz > absy) + std::swap(absy, absz); + if(absy > absx) + std::swap(absx, absy); + if(absz > absy) + std::swap(absy, absz); + for(; absx < 0x400; absx <<= 1, --expx) + ; + for(; absy < 0x400; absy <<= 1, --expy) + ; + for(; absz < 0x400; absz <<= 1, --expz) + ; + detail::uint32 mx = (absx & 0x3FF) | 0x400, my = (absy & 0x3FF) | 0x400, + mz = (absz & 0x3FF) | 0x400; + mx *= mx; + my *= my; + mz *= mz; + int ix = mx >> 21, iy = my >> 21, iz = mz >> 21; + expx = 2 * (expx + (absx >> 10)) - 15 + ix; + expy = 2 * (expy + (absy >> 10)) - 15 + iy; + expz = 2 * (expz + (absz >> 10)) - 15 + iz; + mx <<= 10 - ix; + my <<= 10 - iy; + mz <<= 10 - iz; + int d = expy - expz; + mz = (d < 30) ? ((mz >> d) | ((mz & ((static_cast(1) << d) - 1)) != 0)) : 1; + my += mz; + if(my & 0x80000000) + { + my = (my >> 1) | (my & 1); + if(++expy > expx) + { + std::swap(mx, my); + std::swap(expx, expy); + } + } + d = expx - expy; + my = (d < 30) ? ((my >> d) | ((my & ((static_cast(1) << d) - 1)) != 0)) : 1; + return half(detail::binary, detail::hypot_post(mx + my, expx)); +#endif +} + +/// Power function. +/// This function may be 1 ULP off the correctly rounded exact result for any rounding mode in +/// ~0.00025% of inputs. +/// +/// **See also:** Documentation for [std::pow](https://en.cppreference.com/w/cpp/numeric/math/pow). +/// \param x base +/// \param y exponent +/// \return \a x raised to \a y +/// \exception FE_INVALID if \a x or \a y is signaling NaN or if \a x is finite an negative and \a y +/// is finite and not integral +/// \exception FE_DIVBYZERO if \a x is 0 and \a y is negative +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half pow(half x, half y) +{ +#ifdef HALF_ARITHMETIC_TYPE + return half(detail::binary, + detail::float2half( + std::pow(detail::half2float(x.data_), + detail::half2float(y.data_)))); +#else + int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, exp = -15; + if(!absy || x.data_ == 0x3C00) + return half(detail::binary, + detail::select(0x3C00, (x.data_ == 0x3C00) ? y.data_ : x.data_)); + bool is_int = absy >= 0x6400 || (absy >= 0x3C00 && !(absy & ((1 << (25 - (absy >> 10))) - 1))); + unsigned int sign = + x.data_ & + (static_cast((absy < 0x6800) && is_int && ((absy >> (25 - (absy >> 10))) & 1)) + << 15); + if(absx >= 0x7C00 || absy >= 0x7C00) + return half(detail::binary, + (absx > 0x7C00 || absy > 0x7C00) + ? detail::signal(x.data_, y.data_) + : (absy == 0x7C00) + ? ((absx == 0x3C00) + ? 0x3C00 + : (!absx && y.data_ == 0xFC00) + ? detail::pole() + : (0x7C00 & -((y.data_ >> 15) ^ (absx > 0x3C00)))) + : (sign | (0x7C00 & ((y.data_ >> 15) - 1U)))); + if(!absx) + return half(detail::binary, (y.data_ & 0x8000) ? 
detail::pole(sign) : sign); + if((x.data_ & 0x8000) && !is_int) + return half(detail::binary, detail::invalid()); + if(x.data_ == 0xBC00) + return half(detail::binary, sign | 0x3C00); + if(y.data_ == 0x3800) + return sqrt(x); + if(y.data_ == 0x3C00) + return half(detail::binary, detail::check_underflow(x.data_)); + if(y.data_ == 0x4000) + return x * x; + for(; absx < 0x400; absx <<= 1, --exp) + ; + detail::uint32 ilog = exp + (absx >> 10), msign = detail::sign_mask(ilog), f, + m = (((ilog << 27) + + ((detail::log2(static_cast((absx & 0x3FF) | 0x400) << 20) + + 8) >> + 4)) ^ + msign) - + msign; + for(exp = -11; m < 0x80000000; m <<= 1, --exp) + ; + for(; absy < 0x400; absy <<= 1, --exp) + ; + m = detail::multiply64(m, static_cast((absy & 0x3FF) | 0x400) << 21); + int i = m >> 31; + exp += (absy >> 10) + i; + m <<= 1 - i; + if(exp < 0) + { + f = m >> -exp; + exp = 0; + } + else + { + f = (m << exp) & 0x7FFFFFFF; + exp = m >> (31 - exp); + } + return half(detail::binary, + detail::exp2_post( + detail::exp2(f), exp, ((msign & 1) ^ (y.data_ >> 15)) != 0, sign)); +#endif +} + +/// \} +/// \anchor trigonometric +/// \name Trigonometric functions +/// \{ + +/// Compute sine and cosine simultaneously. +/// This returns the same results as sin() and cos() but is faster than calling each function +/// individually. +/// +/// This function is exact to rounding for all rounding modes. +/// \param arg function argument +/// \param sin variable to take sine of \a arg +/// \param cos variable to take cosine of \a arg +/// \exception FE_INVALID for signaling NaN or infinity +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline void sincos(half arg, half* sin, half* cos) +{ +#ifdef HALF_ARITHMETIC_TYPE + detail::internal_t f = detail::half2float(arg.data_); + *sin = half(detail::binary, detail::float2half(std::sin(f))); + *cos = half(detail::binary, detail::float2half(std::cos(f))); +#else + int abs = arg.data_ & 0x7FFF, sign = arg.data_ >> 15, k; + if(abs >= 0x7C00) + *sin = *cos = + half(detail::binary, (abs == 0x7C00) ? 
detail::invalid() : detail::signal(arg.data_)); + else if(!abs) + { + *sin = arg; + *cos = half(detail::binary, 0x3C00); + } + else if(abs < 0x2500) + { + *sin = half(detail::binary, detail::rounded(arg.data_ - 1, 1, 1)); + *cos = half(detail::binary, detail::rounded(0x3BFF, 1, 1)); + } + else + { + if(half::round_style != std::round_to_nearest) + { + switch(abs) + { + case 0x48B7: + *sin = half( + detail::binary, + detail::rounded((~arg.data_ & 0x8000) | 0x1D07, 1, 1)); + *cos = half(detail::binary, detail::rounded(0xBBFF, 1, 1)); + return; + case 0x598C: + *sin = half( + detail::binary, + detail::rounded((arg.data_ & 0x8000) | 0x3BFF, 1, 1)); + *cos = half(detail::binary, detail::rounded(0x80FC, 1, 1)); + return; + case 0x6A64: + *sin = half( + detail::binary, + detail::rounded((~arg.data_ & 0x8000) | 0x3BFE, 1, 1)); + *cos = half(detail::binary, detail::rounded(0x27FF, 1, 1)); + return; + case 0x6D8C: + *sin = half( + detail::binary, + detail::rounded((arg.data_ & 0x8000) | 0x0FE6, 1, 1)); + *cos = half(detail::binary, detail::rounded(0x3BFF, 1, 1)); + return; + } + } + std::pair sc = + detail::sincos(detail::angle_arg(abs, k), 28); + switch(k & 3) + { + case 1: sc = std::make_pair(sc.second, -sc.first); break; + case 2: sc = std::make_pair(-sc.first, -sc.second); break; + case 3: sc = std::make_pair(-sc.second, sc.first); break; + } + *sin = half(detail::binary, + detail::fixed2half( + (sc.first ^ -static_cast(sign)) + sign)); + *cos = half(detail::binary, + detail::fixed2half(sc.second)); + } +#endif +} + +/// Sine function. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for [std::sin](https://en.cppreference.com/w/cpp/numeric/math/sin). +/// \param arg function argument +/// \return sine value of \a arg +/// \exception FE_INVALID for signaling NaN or infinity +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half sin(half arg) +{ +#ifdef HALF_ARITHMETIC_TYPE + return half(detail::binary, + detail::float2half( + std::sin(detail::half2float(arg.data_)))); +#else + int abs = arg.data_ & 0x7FFF, k; + if(!abs) + return arg; + if(abs >= 0x7C00) + return half(detail::binary, + (abs == 0x7C00) ? detail::invalid() : detail::signal(arg.data_)); + if(abs < 0x2900) + return half(detail::binary, detail::rounded(arg.data_ - 1, 1, 1)); + if(half::round_style != std::round_to_nearest) + switch(abs) + { + case 0x48B7: + return half( + detail::binary, + detail::rounded((~arg.data_ & 0x8000) | 0x1D07, 1, 1)); + case 0x6A64: + return half( + detail::binary, + detail::rounded((~arg.data_ & 0x8000) | 0x3BFE, 1, 1)); + case 0x6D8C: + return half( + detail::binary, + detail::rounded((arg.data_ & 0x8000) | 0x0FE6, 1, 1)); + } + std::pair sc = detail::sincos(detail::angle_arg(abs, k), 28); + detail::uint32 sign = -static_cast(((k >> 1) & 1) ^ (arg.data_ >> 15)); + return half(detail::binary, + detail::fixed2half( + (((k & 1) ? sc.second : sc.first) ^ sign) - sign)); +#endif +} + +/// Cosine function. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for [std::cos](https://en.cppreference.com/w/cpp/numeric/math/cos). 
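`sincos` above shares one argument reduction between both results, so it is the cheaper call when sine and cosine are both needed; a brief sketch:

    #include "half.hpp"
    #include <iostream>

    int main()
    {
        using half_float::half;
        half s(0.0f), c(0.0f);
        sincos(half(1.0f), &s, &c);           // one reduction, two results
        std::cout << s << ' ' << c << '\n';   // roughly 0.8413 and 0.5405 after half rounding
        return 0;
    }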
+/// \param arg function argument +/// \return cosine value of \a arg +/// \exception FE_INVALID for signaling NaN or infinity +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half cos(half arg) +{ +#ifdef HALF_ARITHMETIC_TYPE + return half(detail::binary, + detail::float2half( + std::cos(detail::half2float(arg.data_)))); +#else + int abs = arg.data_ & 0x7FFF, k; + if(!abs) + return half(detail::binary, 0x3C00); + if(abs >= 0x7C00) + return half(detail::binary, + (abs == 0x7C00) ? detail::invalid() : detail::signal(arg.data_)); + if(abs < 0x2500) + return half(detail::binary, detail::rounded(0x3BFF, 1, 1)); + if(half::round_style != std::round_to_nearest && abs == 0x598C) + return half(detail::binary, detail::rounded(0x80FC, 1, 1)); + std::pair sc = detail::sincos(detail::angle_arg(abs, k), 28); + detail::uint32 sign = -static_cast(((k >> 1) ^ k) & 1); + return half(detail::binary, + detail::fixed2half( + (((k & 1) ? sc.first : sc.second) ^ sign) - sign)); +#endif +} + +/// Tangent function. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for [std::tan](https://en.cppreference.com/w/cpp/numeric/math/tan). +/// \param arg function argument +/// \return tangent value of \a arg +/// \exception FE_INVALID for signaling NaN or infinity +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half tan(half arg) +{ +#ifdef HALF_ARITHMETIC_TYPE + return half(detail::binary, + detail::float2half( + std::tan(detail::half2float(arg.data_)))); +#else + int abs = arg.data_ & 0x7FFF, exp = 13, k; + if(!abs) + return arg; + if(abs >= 0x7C00) + return half(detail::binary, + (abs == 0x7C00) ? detail::invalid() : detail::signal(arg.data_)); + if(abs < 0x2700) + return half(detail::binary, detail::rounded(arg.data_, 0, 1)); + if(half::round_style != std::round_to_nearest) + switch(abs) + { + case 0x658C: + return half( + detail::binary, + detail::rounded((arg.data_ & 0x8000) | 0x07E6, 1, 1)); + case 0x7330: + return half( + detail::binary, + detail::rounded((~arg.data_ & 0x8000) | 0x4B62, 1, 1)); + } + std::pair sc = detail::sincos(detail::angle_arg(abs, k), 30); + if(k & 1) + sc = std::make_pair(-sc.second, sc.first); + detail::uint32 signy = detail::sign_mask(sc.first), signx = detail::sign_mask(sc.second); + detail::uint32 my = (sc.first ^ signy) - signy, mx = (sc.second ^ signx) - signx; + for(; my < 0x80000000; my <<= 1, --exp) + ; + for(; mx < 0x80000000; mx <<= 1, ++exp) + ; + return half( + detail::binary, + detail::tangent_post(my, mx, exp, (signy ^ signx ^ arg.data_) & 0x8000)); +#endif +} + +/// Arc sine. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for +/// [std::asin](https://en.cppreference.com/w/cpp/numeric/math/asin). +/// \param arg function argument +/// \return arc sine value of \a arg +/// \exception FE_INVALID for signaling NaN or if abs(\a arg) > 1 +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half asin(half arg) +{ +#ifdef HALF_ARITHMETIC_TYPE + return half(detail::binary, + detail::float2half( + std::asin(detail::half2float(arg.data_)))); +#else + unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ & 0x8000; + if(!abs) + return arg; + if(abs >= 0x3C00) + return half(detail::binary, + (abs > 0x7C00) + ? detail::signal(arg.data_) + : (abs > 0x3C00) + ? 
detail::invalid() + : detail::rounded(sign | 0x3E48, 0, 1)); + if(abs < 0x2900) + return half(detail::binary, detail::rounded(arg.data_, 0, 1)); + if(half::round_style != std::round_to_nearest && (abs == 0x2B44 || abs == 0x2DC3)) + return half(detail::binary, detail::rounded(arg.data_ + 1, 1, 1)); + std::pair sc = detail::atan2_args(abs); + detail::uint32 m = + detail::atan2(sc.first, sc.second, (half::round_style == std::round_to_nearest) ? 27 : 26); + return half(detail::binary, + detail::fixed2half(m, 14, sign)); +#endif +} + +/// Arc cosine function. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for +/// [std::acos](https://en.cppreference.com/w/cpp/numeric/math/acos). +/// \param arg function argument +/// \return arc cosine value of \a arg +/// \exception FE_INVALID for signaling NaN or if abs(\a arg) > 1 +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half acos(half arg) +{ +#ifdef HALF_ARITHMETIC_TYPE + return half(detail::binary, + detail::float2half( + std::acos(detail::half2float(arg.data_)))); +#else + unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ >> 15; + if(!abs) + return half(detail::binary, detail::rounded(0x3E48, 0, 1)); + if(abs >= 0x3C00) + return half(detail::binary, + (abs > 0x7C00) + ? detail::signal(arg.data_) + : (abs > 0x3C00) + ? detail::invalid() + : sign ? detail::rounded(0x4248, 0, 1) : 0); + std::pair cs = detail::atan2_args(abs); + detail::uint32 m = detail::atan2(cs.second, cs.first, 28); + return half(detail::binary, + detail::fixed2half( + sign ? (0xC90FDAA2 - m) : m, 15, 0, sign)); +#endif +} + +/// Arc tangent function. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for +/// [std::atan](https://en.cppreference.com/w/cpp/numeric/math/atan). +/// \param arg function argument +/// \return arc tangent value of \a arg +/// \exception FE_INVALID for signaling NaN +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half atan(half arg) +{ +#ifdef HALF_ARITHMETIC_TYPE + return half(detail::binary, + detail::float2half( + std::atan(detail::half2float(arg.data_)))); +#else + unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ & 0x8000; + if(!abs) + return arg; + if(abs >= 0x7C00) + return half(detail::binary, + (abs == 0x7C00) ? detail::rounded(sign | 0x3E48, 0, 1) + : detail::signal(arg.data_)); + if(abs <= 0x2700) + return half(detail::binary, detail::rounded(arg.data_ - 1, 1, 1)); + int exp = (abs >> 10) + (abs <= 0x3FF); + detail::uint32 my = (abs & 0x3FF) | ((abs > 0x3FF) << 10); + detail::uint32 m = (exp > 15) + ? detail::atan2(my << 19, + 0x20000000 >> (exp - 15), + (half::round_style == std::round_to_nearest) ? 26 : 24) + : detail::atan2(my << (exp + 4), + 0x20000000, + (half::round_style == std::round_to_nearest) ? 30 : 28); + return half(detail::binary, + detail::fixed2half(m, 14, sign)); +#endif +} + +/// Arc tangent function. +/// This function may be 1 ULP off the correctly rounded exact result in ~0.005% of inputs for +/// `std::round_to_nearest`, +/// in ~0.1% of inputs for `std::round_toward_zero` and in ~0.02% of inputs for any other rounding +/// mode. +/// +/// **See also:** Documentation for +/// [std::atan2](https://en.cppreference.com/w/cpp/numeric/math/atan2). 
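`atan2` above recovers the quadrant from the signs of both operands, which a plain `atan(y/x)` cannot; a small sketch:

    #include "half.hpp"
    #include <iostream>

    int main()
    {
        using half_float::half;
        half y(1.0f), x(-1.0f);
        std::cout << atan(y / x) << '\n';   // about -pi/4: the sign of x is lost in the division
        std::cout << atan2(y, x) << '\n';   // about 3*pi/4, the angle in the second quadrant
        return 0;
    }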
+/// \param y numerator +/// \param x denominator +/// \return arc tangent value +/// \exception FE_INVALID if \a x or \a y is signaling NaN +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half atan2(half y, half x) +{ +#ifdef HALF_ARITHMETIC_TYPE + return half(detail::binary, + detail::float2half( + std::atan2(detail::half2float(y.data_), + detail::half2float(x.data_)))); +#else + unsigned int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, signx = x.data_ >> 15, + signy = y.data_ & 0x8000; + if(absx >= 0x7C00 || absy >= 0x7C00) + { + if(absx > 0x7C00 || absy > 0x7C00) + return half(detail::binary, detail::signal(x.data_, y.data_)); + if(absy == 0x7C00) + return half(detail::binary, + (absx < 0x7C00) + ? detail::rounded(signy | 0x3E48, 0, 1) + : signx + ? detail::rounded(signy | 0x40B6, 0, 1) + : detail::rounded(signy | 0x3A48, 0, 1)); + return (x.data_ == 0x7C00) + ? half(detail::binary, signy) + : half(detail::binary, + detail::rounded(signy | 0x4248, 0, 1)); + } + if(!absy) + return signx ? half(detail::binary, + detail::rounded(signy | 0x4248, 0, 1)) + : y; + if(!absx) + return half(detail::binary, detail::rounded(signy | 0x3E48, 0, 1)); + int d = (absy >> 10) + (absy <= 0x3FF) - (absx >> 10) - (absx <= 0x3FF); + if(d > (signx ? 18 : 12)) + return half(detail::binary, detail::rounded(signy | 0x3E48, 0, 1)); + if(signx && d < -11) + return half(detail::binary, detail::rounded(signy | 0x4248, 0, 1)); + if(!signx && d < ((half::round_style == std::round_toward_zero) ? -15 : -9)) + { + for(; absy < 0x400; absy <<= 1, --d) + ; + detail::uint32 mx = ((absx << 1) & 0x7FF) | 0x800, my = ((absy << 1) & 0x7FF) | 0x800; + int i = my < mx; + d -= i; + if(d < -25) + return half(detail::binary, detail::underflow(signy)); + my <<= 11 + i; + return half(detail::binary, + detail::fixed2half( + my / mx, d + 14, signy, my % mx != 0)); + } + detail::uint32 m = detail::atan2( + ((absy & 0x3FF) | ((absy > 0x3FF) << 10)) << (19 + ((d < 0) ? d : (d > 0) ? 0 : -1)), + ((absx & 0x3FF) | ((absx > 0x3FF) << 10)) << (19 - ((d > 0) ? d : (d < 0) ? 0 : 1))); + return half(detail::binary, + detail::fixed2half( + signx ? (0xC90FDAA2 - m) : m, 15, signy, signx)); +#endif +} + +/// \} +/// \anchor hyperbolic +/// \name Hyperbolic functions +/// \{ + +/// Hyperbolic sine. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for +/// [std::sinh](https://en.cppreference.com/w/cpp/numeric/math/sinh). +/// \param arg function argument +/// \return hyperbolic sine value of \a arg +/// \exception FE_INVALID for signaling NaN +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half sinh(half arg) +{ +#ifdef HALF_ARITHMETIC_TYPE + return half(detail::binary, + detail::float2half( + std::sinh(detail::half2float(arg.data_)))); +#else + int abs = arg.data_ & 0x7FFF, exp; + if(!abs || abs >= 0x7C00) + return (abs > 0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg; + if(abs <= 0x2900) + return half(detail::binary, detail::rounded(arg.data_, 0, 1)); + std::pair mm = + detail::hyperbolic_args(abs, exp, (half::round_style == std::round_to_nearest) ? 29 : 27); + detail::uint32 m = mm.first - mm.second; + for(exp += 13; m < 0x80000000 && exp; m <<= 1, --exp) + ; + unsigned int sign = arg.data_ & 0x8000; + if(exp > 29) + return half(detail::binary, detail::overflow(sign)); + return half(detail::binary, + detail::fixed2half(m, exp, sign)); +#endif +} + +/// Hyperbolic cosine. 
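With 65504 as the largest finite value, the hyperbolic functions above saturate very early; a sketch assuming the default round-to-nearest mode, in which overflow yields infinity:

    #include "half.hpp"
    #include <iostream>

    int main()
    {
        using half_float::half;
        std::cout << sinh(half(11.0f)) << '\n';          // about 2.99e4, still finite
        std::cout << isinf(sinh(half(12.0f))) << '\n';   // 1: e^12 / 2 is about 8.1e4 > 65504
        return 0;
    }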
+/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for +/// [std::cosh](https://en.cppreference.com/w/cpp/numeric/math/cosh). +/// \param arg function argument +/// \return hyperbolic cosine value of \a arg +/// \exception FE_INVALID for signaling NaN +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half cosh(half arg) +{ +#ifdef HALF_ARITHMETIC_TYPE + return half(detail::binary, + detail::float2half( + std::cosh(detail::half2float(arg.data_)))); +#else + int abs = arg.data_ & 0x7FFF, exp; + if(!abs) + return half(detail::binary, 0x3C00); + if(abs >= 0x7C00) + return half(detail::binary, (abs > 0x7C00) ? detail::signal(arg.data_) : 0x7C00); + std::pair mm = + detail::hyperbolic_args(abs, exp, (half::round_style == std::round_to_nearest) ? 23 : 26); + detail::uint32 m = mm.first + mm.second, i = (~m & 0xFFFFFFFF) >> 31; + m = (m >> i) | (m & i) | 0x80000000; + if((exp += 13 + i) > 29) + return half(detail::binary, detail::overflow()); + return half(detail::binary, + detail::fixed2half(m, exp)); +#endif +} + +/// Hyperbolic tangent. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for +/// [std::tanh](https://en.cppreference.com/w/cpp/numeric/math/tanh). +/// \param arg function argument +/// \return hyperbolic tangent value of \a arg +/// \exception FE_INVALID for signaling NaN +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half tanh(half arg) +{ +#ifdef HALF_ARITHMETIC_TYPE + return half(detail::binary, + detail::float2half( + std::tanh(detail::half2float(arg.data_)))); +#else + int abs = arg.data_ & 0x7FFF, exp; + if(!abs) + return arg; + if(abs >= 0x7C00) + return half(detail::binary, + (abs > 0x7C00) ? detail::signal(arg.data_) : (arg.data_ - 0x4000)); + if(abs >= 0x4500) + return half(detail::binary, + detail::rounded((arg.data_ & 0x8000) | 0x3BFF, 1, 1)); + if(abs < 0x2700) + return half(detail::binary, detail::rounded(arg.data_ - 1, 1, 1)); + if(half::round_style != std::round_to_nearest && abs == 0x2D3F) + return half(detail::binary, detail::rounded(arg.data_ - 3, 0, 1)); + std::pair mm = detail::hyperbolic_args(abs, exp, 27); + detail::uint32 my = mm.first - mm.second - (half::round_style != std::round_to_nearest), + mx = mm.first + mm.second, i = (~mx & 0xFFFFFFFF) >> 31; + for(exp = 13; my < 0x80000000; my <<= 1, --exp) + ; + mx = (mx >> i) | 0x80000000; + return half(detail::binary, + detail::tangent_post(my, mx, exp - i, arg.data_ & 0x8000)); +#endif +} + +/// Hyperbolic area sine. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for +/// [std::asinh](https://en.cppreference.com/w/cpp/numeric/math/asinh). +/// \param arg function argument +/// \return area sine value of \a arg +/// \exception FE_INVALID for signaling NaN +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half asinh(half arg) +{ +#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH + return half(detail::binary, + detail::float2half( + std::asinh(detail::half2float(arg.data_)))); +#else + int abs = arg.data_ & 0x7FFF; + if(!abs || abs >= 0x7C00) + return (abs > 0x7C00) ? 
half(detail::binary, detail::signal(arg.data_)) : arg; + if(abs <= 0x2900) + return half(detail::binary, detail::rounded(arg.data_ - 1, 1, 1)); + if(half::round_style != std::round_to_nearest) + switch(abs) + { + case 0x32D4: + return half(detail::binary, + detail::rounded(arg.data_ - 13, 1, 1)); + case 0x3B5B: + return half(detail::binary, + detail::rounded(arg.data_ - 197, 1, 1)); + } + return half(detail::binary, detail::area(arg.data_)); +#endif +} + +/// Hyperbolic area cosine. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for +/// [std::acosh](https://en.cppreference.com/w/cpp/numeric/math/acosh). +/// \param arg function argument +/// \return area cosine value of \a arg +/// \exception FE_INVALID for signaling NaN or arguments <1 +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half acosh(half arg) +{ +#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH + return half(detail::binary, + detail::float2half( + std::acosh(detail::half2float(arg.data_)))); +#else + int abs = arg.data_ & 0x7FFF; + if((arg.data_ & 0x8000) || abs < 0x3C00) + return half(detail::binary, + (abs <= 0x7C00) ? detail::invalid() : detail::signal(arg.data_)); + if(abs == 0x3C00) + return half(detail::binary, 0); + if(arg.data_ >= 0x7C00) + return (abs > 0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg; + return half(detail::binary, detail::area(arg.data_)); +#endif +} + +/// Hyperbolic area tangent. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for +/// [std::atanh](https://en.cppreference.com/w/cpp/numeric/math/atanh). +/// \param arg function argument +/// \return area tangent value of \a arg +/// \exception FE_INVALID for signaling NaN or if abs(\a arg) > 1 +/// \exception FE_DIVBYZERO for +/-1 +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half atanh(half arg) +{ +#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH + return half(detail::binary, + detail::float2half( + std::atanh(detail::half2float(arg.data_)))); +#else + int abs = arg.data_ & 0x7FFF, exp = 0; + if(!abs) + return arg; + if(abs >= 0x3C00) + return half(detail::binary, + (abs == 0x3C00) + ? detail::pole(arg.data_ & 0x8000) + : (abs <= 0x7C00) ? detail::invalid() : detail::signal(arg.data_)); + if(abs < 0x2700) + return half(detail::binary, detail::rounded(arg.data_, 0, 1)); + detail::uint32 m = static_cast((abs & 0x3FF) | ((abs > 0x3FF) << 10)) + << ((abs >> 10) + (abs <= 0x3FF) + 6), + my = 0x80000000 + m, mx = 0x80000000 - m; + for(; mx < 0x80000000; mx <<= 1, ++exp) + ; + int i = my >= mx, s; + return half(detail::binary, + detail::log2_post( + detail::log2((detail::divide64(my >> i, mx, s) + 1) >> 1, 27) + 0x10, + exp + i - 1, + 16, + arg.data_ & 0x8000)); +#endif +} + +/// \} +/// \anchor special +/// \name Error and gamma functions +/// \{ + +/// Error function. +/// This function may be 1 ULP off the correctly rounded exact result for any rounding mode in <0.5% +/// of inputs. +/// +/// **See also:** Documentation for [std::erf](https://en.cppreference.com/w/cpp/numeric/math/erf). 
+/// \param arg function argument +/// \return error function value of \a arg +/// \exception FE_INVALID for signaling NaN +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half erf(half arg) +{ +#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH + return half(detail::binary, + detail::float2half( + std::erf(detail::half2float(arg.data_)))); +#else + unsigned int abs = arg.data_ & 0x7FFF; + if(!abs || abs >= 0x7C00) + return (abs >= 0x7C00) + ? half(detail::binary, + (abs == 0x7C00) ? (arg.data_ - 0x4000) : detail::signal(arg.data_)) + : arg; + if(abs >= 0x4200) + return half(detail::binary, + detail::rounded((arg.data_ & 0x8000) | 0x3BFF, 1, 1)); + return half(detail::binary, detail::erf(arg.data_)); +#endif +} + +/// Complementary error function. +/// This function may be 1 ULP off the correctly rounded exact result for any rounding mode in <0.5% +/// of inputs. +/// +/// **See also:** Documentation for +/// [std::erfc](https://en.cppreference.com/w/cpp/numeric/math/erfc). +/// \param arg function argument +/// \return 1 minus error function value of \a arg +/// \exception FE_INVALID for signaling NaN +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half erfc(half arg) +{ +#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH + return half(detail::binary, + detail::float2half( + std::erfc(detail::half2float(arg.data_)))); +#else + unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ & 0x8000; + if(abs >= 0x7C00) + return (abs >= 0x7C00) + ? half(detail::binary, (abs == 0x7C00) ? (sign >> 1) : detail::signal(arg.data_)) + : arg; + if(!abs) + return half(detail::binary, 0x3C00); + if(abs >= 0x4400) + return half( + detail::binary, + detail::rounded((sign >> 1) - (sign >> 15), sign >> 15, 1)); + return half(detail::binary, detail::erf(arg.data_)); +#endif +} + +/// Natural logarithm of gamma function. +/// This function may be 1 ULP off the correctly rounded exact result for any rounding mode in +/// ~0.025% of inputs. +/// +/// **See also:** Documentation for +/// [std::lgamma](https://en.cppreference.com/w/cpp/numeric/math/lgamma). +/// \param arg function argument +/// \return natural logarith of gamma function for \a arg +/// \exception FE_INVALID for signaling NaN +/// \exception FE_DIVBYZERO for 0 or negative integer arguments +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half lgamma(half arg) +{ +#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH + return half(detail::binary, + detail::float2half( + std::lgamma(detail::half2float(arg.data_)))); +#else + int abs = arg.data_ & 0x7FFF; + if(abs >= 0x7C00) + return half(detail::binary, (abs == 0x7C00) ? 0x7C00 : detail::signal(arg.data_)); + if(!abs || arg.data_ >= 0xE400 || + (arg.data_ >= 0xBC00 && !(abs & ((1 << (25 - (abs >> 10))) - 1)))) + return half(detail::binary, detail::pole()); + if(arg.data_ == 0x3C00 || arg.data_ == 0x4000) + return half(detail::binary, 0); + return half(detail::binary, detail::gamma(arg.data_)); +#endif +} + +/// Gamma function. +/// This function may be 1 ULP off the correctly rounded exact result for any rounding mode in +/// <0.25% of inputs. +/// +/// **See also:** Documentation for +/// [std::tgamma](https://en.cppreference.com/w/cpp/numeric/math/tgamma). 
+/// \param arg function argument +/// \return gamma function value of \a arg +/// \exception FE_INVALID for signaling NaN, negative infinity or negative integer arguments +/// \exception FE_DIVBYZERO for 0 +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half tgamma(half arg) +{ +#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH + return half(detail::binary, + detail::float2half( + std::tgamma(detail::half2float(arg.data_)))); +#else + unsigned int abs = arg.data_ & 0x7FFF; + if(!abs) + return half(detail::binary, detail::pole(arg.data_)); + if(abs >= 0x7C00) + return (arg.data_ == 0x7C00) ? arg : half(detail::binary, detail::signal(arg.data_)); + if(arg.data_ >= 0xE400 || (arg.data_ >= 0xBC00 && !(abs & ((1 << (25 - (abs >> 10))) - 1)))) + return half(detail::binary, detail::invalid()); + if(arg.data_ >= 0xCA80) + return half( + detail::binary, + detail::underflow((1 - ((abs >> (25 - (abs >> 10))) & 1)) << 15)); + if(arg.data_ <= 0x100 || (arg.data_ >= 0x4900 && arg.data_ < 0x8000)) + return half(detail::binary, detail::overflow()); + if(arg.data_ == 0x3C00) + return arg; + return half(detail::binary, detail::gamma(arg.data_)); +#endif +} + +/// \} +/// \anchor rounding +/// \name Rounding +/// \{ + +/// Nearest integer not less than half value. +/// **See also:** Documentation for +/// [std::ceil](https://en.cppreference.com/w/cpp/numeric/math/ceil). +/// \param arg half to round +/// \return nearest integer not less than \a arg +/// \exception FE_INVALID for signaling NaN +/// \exception FE_INEXACT if value had to be rounded +inline half ceil(half arg) +{ + return half(detail::binary, + detail::integral(arg.data_)); +} + +/// Nearest integer not greater than half value. +/// **See also:** Documentation for +/// [std::floor](https://en.cppreference.com/w/cpp/numeric/math/floor). +/// \param arg half to round +/// \return nearest integer not greater than \a arg +/// \exception FE_INVALID for signaling NaN +/// \exception FE_INEXACT if value had to be rounded +inline half floor(half arg) +{ + return half(detail::binary, + detail::integral(arg.data_)); +} + +/// Nearest integer not greater in magnitude than half value. +/// **See also:** Documentation for +/// [std::trunc](https://en.cppreference.com/w/cpp/numeric/math/trunc). +/// \param arg half to round +/// \return nearest integer not greater in magnitude than \a arg +/// \exception FE_INVALID for signaling NaN +/// \exception FE_INEXACT if value had to be rounded +inline half trunc(half arg) +{ + return half(detail::binary, detail::integral(arg.data_)); +} + +/// Nearest integer. +/// **See also:** Documentation for +/// [std::round](https://en.cppreference.com/w/cpp/numeric/math/round). +/// \param arg half to round +/// \return nearest integer, rounded away from zero in half-way cases +/// \exception FE_INVALID for signaling NaN +/// \exception FE_INEXACT if value had to be rounded +inline half round(half arg) +{ + return half(detail::binary, detail::integral(arg.data_)); +} + +/// Nearest integer. +/// **See also:** Documentation for +/// [std::lround](https://en.cppreference.com/w/cpp/numeric/math/round). +/// \param arg half to round +/// \return nearest integer, rounded away from zero in half-way cases +/// \exception FE_INVALID if value is not representable as `long` +inline long lround(half arg) +{ + return detail::half2int(arg.data_); +} + +/// Nearest integer using half's internal rounding mode. 
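The practical distinction in this group is that round and lround always break ties away from zero, while rint, lrint and nearbyint (defined next) honor half::round_style. A minimal sketch, assuming the header is reachable as "half.hpp" and the library's default round-to-nearest(-even) configuration:

    #include <iostream>
    #include "half.hpp" // assumed include path

    int main()
    {
        using half_float::half;
        half x(2.5f); // an exactly representable halfway case
        std::cout << "round(2.5)   = " << half_float::round(x) << '\n';            // 3: ties away from zero
        std::cout << "rint(2.5)    = " << half_float::rint(x) << '\n';             // 2 under ties-to-even
        std::cout << "lround(-2.5) = " << half_float::lround(half(-2.5f)) << '\n'; // -3
        return 0;
    }
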
+/// **See also:** Documentation for +/// [std::rint](https://en.cppreference.com/w/cpp/numeric/math/rint). +/// \param arg half expression to round +/// \return nearest integer using default rounding mode +/// \exception FE_INVALID for signaling NaN +/// \exception FE_INEXACT if value had to be rounded +inline half rint(half arg) +{ + return half(detail::binary, detail::integral(arg.data_)); +} + +/// Nearest integer using half's internal rounding mode. +/// **See also:** Documentation for +/// [std::lrint](https://en.cppreference.com/w/cpp/numeric/math/rint). +/// \param arg half expression to round +/// \return nearest integer using default rounding mode +/// \exception FE_INVALID if value is not representable as `long` +/// \exception FE_INEXACT if value had to be rounded +inline long lrint(half arg) +{ + return detail::half2int(arg.data_); +} + +/// Nearest integer using half's internal rounding mode. +/// **See also:** Documentation for +/// [std::nearbyint](https://en.cppreference.com/w/cpp/numeric/math/nearbyint). +/// \param arg half expression to round +/// \return nearest integer using default rounding mode +/// \exception FE_INVALID for signaling NaN +inline half nearbyint(half arg) +{ + return half(detail::binary, detail::integral(arg.data_)); +} +#if HALF_ENABLE_CPP11_LONG_LONG +/// Nearest integer. +/// **See also:** Documentation for +/// [std::llround](https://en.cppreference.com/w/cpp/numeric/math/round). +/// \param arg half to round +/// \return nearest integer, rounded away from zero in half-way cases +/// \exception FE_INVALID if value is not representable as `long long` +inline long long llround(half arg) +{ + return detail::half2int(arg.data_); +} + +/// Nearest integer using half's internal rounding mode. +/// **See also:** Documentation for +/// [std::llrint](https://en.cppreference.com/w/cpp/numeric/math/rint). +/// \param arg half expression to round +/// \return nearest integer using default rounding mode +/// \exception FE_INVALID if value is not representable as `long long` +/// \exception FE_INEXACT if value had to be rounded +inline long long llrint(half arg) +{ + return detail::half2int(arg.data_); +} +#endif + +/// \} +/// \anchor float +/// \name Floating point manipulation +/// \{ + +/// Decompress floating-point number. +/// **See also:** Documentation for +/// [std::frexp](https://en.cppreference.com/w/cpp/numeric/math/frexp). +/// \param arg number to decompress +/// \param exp address to store exponent at +/// \return significant in range [0.5, 1) +/// \exception FE_INVALID for signaling NaN +inline half frexp(half arg, int* exp) +{ + *exp = 0; + unsigned int abs = arg.data_ & 0x7FFF; + if(abs >= 0x7C00 || !abs) + return (abs > 0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg; + for(; abs < 0x400; abs <<= 1, --*exp) + ; + *exp += (abs >> 10) - 14; + return half(detail::binary, (arg.data_ & 0x8000) | 0x3800 | (abs & 0x3FF)); +} + +/// Multiply by power of two. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for +/// [std::scalbln](https://en.cppreference.com/w/cpp/numeric/math/scalbn). 
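frexp above and ldexp/scalbn below are exact inverses for finite values. The sketch below (again assuming the "half.hpp" include path) decomposes a half into significand and exponent and reassembles it:

    #include <cstdio>
    #include "half.hpp" // assumed include path

    int main()
    {
        using half_float::half;
        half x(6.5f);
        int e = 0;
        half m = half_float::frexp(x, &e);      // m in [0.5, 1), x == m * 2^e
        std::printf("6.5 = %f * 2^%d\n", static_cast<float>(m), e); // 0.8125 * 2^3
        half y = half_float::ldexp(m, e);       // exact round trip
        std::printf("round trip: %f\n", static_cast<float>(y));     // 6.5
        return 0;
    }
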
+/// \param arg number to modify +/// \param exp power of two to multiply with +/// \return \a arg multplied by 2 raised to \a exp +/// \exception FE_INVALID for signaling NaN +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half scalbln(half arg, long exp) +{ + unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ & 0x8000; + if(abs >= 0x7C00 || !abs) + return (abs > 0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg; + for(; abs < 0x400; abs <<= 1, --exp) + ; + exp += abs >> 10; + if(exp > 30) + return half(detail::binary, detail::overflow(sign)); + else if(exp < -10) + return half(detail::binary, detail::underflow(sign)); + else if(exp > 0) + return half(detail::binary, sign | (exp << 10) | (abs & 0x3FF)); + unsigned int m = (abs & 0x3FF) | 0x400; + return half(detail::binary, + detail::rounded( + sign | (m >> (1 - exp)), (m >> -exp) & 1, (m & ((1 << -exp) - 1)) != 0)); +} + +/// Multiply by power of two. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for +/// [std::scalbn](https://en.cppreference.com/w/cpp/numeric/math/scalbn). +/// \param arg number to modify +/// \param exp power of two to multiply with +/// \return \a arg multplied by 2 raised to \a exp +/// \exception FE_INVALID for signaling NaN +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half scalbn(half arg, int exp) { return scalbln(arg, exp); } + +/// Multiply by power of two. +/// This function is exact to rounding for all rounding modes. +/// +/// **See also:** Documentation for +/// [std::ldexp](https://en.cppreference.com/w/cpp/numeric/math/ldexp). +/// \param arg number to modify +/// \param exp power of two to multiply with +/// \return \a arg multplied by 2 raised to \a exp +/// \exception FE_INVALID for signaling NaN +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +inline half ldexp(half arg, int exp) { return scalbln(arg, exp); } + +/// Extract integer and fractional parts. +/// **See also:** Documentation for +/// [std::modf](https://en.cppreference.com/w/cpp/numeric/math/modf). +/// \param arg number to decompress +/// \param iptr address to store integer part at +/// \return fractional part +/// \exception FE_INVALID for signaling NaN +inline half modf(half arg, half* iptr) +{ + unsigned int abs = arg.data_ & 0x7FFF; + if(abs > 0x7C00) + { + arg = half(detail::binary, detail::signal(arg.data_)); + return *iptr = arg, arg; + } + if(abs >= 0x6400) + return *iptr = arg, half(detail::binary, arg.data_ & 0x8000); + if(abs < 0x3C00) + return iptr->data_ = arg.data_ & 0x8000, arg; + unsigned int exp = abs >> 10, mask = (1 << (25 - exp)) - 1, m = arg.data_ & mask; + iptr->data_ = arg.data_ & ~mask; + if(!m) + return half(detail::binary, arg.data_ & 0x8000); + for(; m < 0x400; m <<= 1, --exp) + ; + return half(detail::binary, (arg.data_ & 0x8000) | (exp << 10) | (m & 0x3FF)); +} + +/// Extract exponent. +/// **See also:** Documentation for +/// [std::ilogb](https://en.cppreference.com/w/cpp/numeric/math/ilogb). +/// \param arg number to query +/// \return floating-point exponent +/// \retval FP_ILOGB0 for zero +/// \retval FP_ILOGBNAN for NaN +/// \retval INT_MAX for infinity +/// \exception FE_INVALID for 0 or infinite values +inline int ilogb(half arg) +{ + int abs = arg.data_ & 0x7FFF, exp; + if(!abs || abs >= 0x7C00) + { + detail::raise(FE_INVALID); + return !abs ? FP_ILOGB0 : (abs == 0x7C00) ? 
INT_MAX : FP_ILOGBNAN; + } + for(exp = (abs >> 10) - 15; abs < 0x200; abs <<= 1, --exp) + ; + return exp; +} + +/// Extract exponent. +/// **See also:** Documentation for +/// [std::logb](https://en.cppreference.com/w/cpp/numeric/math/logb). +/// \param arg number to query +/// \return floating-point exponent +/// \exception FE_INVALID for signaling NaN +/// \exception FE_DIVBYZERO for 0 +inline half logb(half arg) +{ + int abs = arg.data_ & 0x7FFF, exp; + if(!abs) + return half(detail::binary, detail::pole(0x8000)); + if(abs >= 0x7C00) + return half(detail::binary, (abs == 0x7C00) ? 0x7C00 : detail::signal(arg.data_)); + for(exp = (abs >> 10) - 15; abs < 0x200; abs <<= 1, --exp) + ; + unsigned int value = static_cast(exp < 0) << 15; + if(exp) + { + unsigned int m = std::abs(exp) << 6; + for(exp = 18; m < 0x400; m <<= 1, --exp) + ; + value |= (exp << 10) + m; + } + return half(detail::binary, value); +} + +/// Next representable value. +/// **See also:** Documentation for +/// [std::nextafter](https://en.cppreference.com/w/cpp/numeric/math/nextafter). +/// \param from value to compute next representable value for +/// \param to direction towards which to compute next value +/// \return next representable value after \a from in direction towards \a to +/// \exception FE_INVALID for signaling NaN +/// \exception FE_OVERFLOW for infinite result from finite argument +/// \exception FE_UNDERFLOW for subnormal result +inline half nextafter(half from, half to) +{ + int fabs = from.data_ & 0x7FFF, tabs = to.data_ & 0x7FFF; + if(fabs > 0x7C00 || tabs > 0x7C00) + return half(detail::binary, detail::signal(from.data_, to.data_)); + if(from.data_ == to.data_ || !(fabs | tabs)) + return to; + if(!fabs) + { + detail::raise(FE_UNDERFLOW, !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT); + return half(detail::binary, (to.data_ & 0x8000) + 1); + } + unsigned int out = + from.data_ + + (((from.data_ >> 15) ^ + static_cast((from.data_ ^ (0x8000 | (0x8000 - (from.data_ >> 15)))) < + (to.data_ ^ (0x8000 | (0x8000 - (to.data_ >> 15)))))) + << 1) - + 1; + detail::raise(FE_OVERFLOW, fabs < 0x7C00 && (out & 0x7C00) == 0x7C00); + detail::raise(FE_UNDERFLOW, !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT && (out & 0x7C00) < 0x400); + return half(detail::binary, out); +} + +/// Next representable value. +/// **See also:** Documentation for +/// [std::nexttoward](https://en.cppreference.com/w/cpp/numeric/math/nexttoward). +/// \param from value to compute next representable value for +/// \param to direction towards which to compute next value +/// \return next representable value after \a from in direction towards \a to +/// \exception FE_INVALID for signaling NaN +/// \exception FE_OVERFLOW for infinite result from finite argument +/// \exception FE_UNDERFLOW for subnormal result +inline half nexttoward(half from, long double to) +{ + int fabs = from.data_ & 0x7FFF; + if(fabs > 0x7C00) + return half(detail::binary, detail::signal(from.data_)); + long double lfrom = static_cast(from); + if(detail::builtin_isnan(to) || lfrom == to) + return half(static_cast(to)); + if(!fabs) + { + detail::raise(FE_UNDERFLOW, !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT); + return half(detail::binary, (static_cast(detail::builtin_signbit(to)) << 15) + 1); + } + unsigned int out = + from.data_ + (((from.data_ >> 15) ^ static_cast(lfrom < to)) << 1) - 1; + detail::raise(FE_OVERFLOW, (out & 0x7FFF) == 0x7C00); + detail::raise(FE_UNDERFLOW, !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT && (out & 0x7FFF) < 0x400); + return half(detail::binary, out); +} + +/// Take sign. 
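nextafter steps by exactly one representable half value, which makes it handy for probing the 10-bit format's ULP spacing; a small sketch under the same assumed "half.hpp" include path:

    #include <cstdio>
    #include "half.hpp" // assumed include path

    int main()
    {
        using half_float::half;
        half one(1.0f);
        half up = half_float::nextafter(one, half(2.0f)); // next value above 1
        // With a 10-bit significand the spacing just above 1 is 2^-10.
        std::printf("nextafter(1,2) - 1 = %g\n", static_cast<float>(up) - 1.0f); // ~0.000976562
        std::printf("ilogb(0.25) = %d\n", half_float::ilogb(half(0.25f)));       // -2
        return 0;
    }
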
+/// **See also:** Documentation for +/// [std::copysign](https://en.cppreference.com/w/cpp/numeric/math/copysign). +/// \param x value to change sign for +/// \param y value to take sign from +/// \return value equal to \a x in magnitude and to \a y in sign +inline HALF_CONSTEXPR half copysign(half x, half y) +{ + return half(detail::binary, x.data_ ^ ((x.data_ ^ y.data_) & 0x8000)); +} + +/// \} +/// \anchor classification +/// \name Floating point classification +/// \{ + +/// Classify floating-point value. +/// **See also:** Documentation for +/// [std::fpclassify](https://en.cppreference.com/w/cpp/numeric/math/fpclassify). +/// \param arg number to classify +/// \retval FP_ZERO for positive and negative zero +/// \retval FP_SUBNORMAL for subnormal numbers +/// \retval FP_INFINITY for positive and negative infinity +/// \retval FP_NAN for NaNs +/// \retval FP_NORMAL for all other (normal) values +inline HALF_CONSTEXPR int fpclassify(half arg) +{ + return !(arg.data_ & 0x7FFF) + ? FP_ZERO + : ((arg.data_ & 0x7FFF) < 0x400) + ? FP_SUBNORMAL + : ((arg.data_ & 0x7FFF) < 0x7C00) + ? FP_NORMAL + : ((arg.data_ & 0x7FFF) == 0x7C00) ? FP_INFINITE : FP_NAN; +} + +/// Check if finite number. +/// **See also:** Documentation for +/// [std::isfinite](https://en.cppreference.com/w/cpp/numeric/math/isfinite). +/// \param arg number to check +/// \retval true if neither infinity nor NaN +/// \retval false else +inline HALF_CONSTEXPR bool isfinite(half arg) { return (arg.data_ & 0x7C00) != 0x7C00; } + +/// Check for infinity. +/// **See also:** Documentation for +/// [std::isinf](https://en.cppreference.com/w/cpp/numeric/math/isinf). +/// \param arg number to check +/// \retval true for positive or negative infinity +/// \retval false else +inline HALF_CONSTEXPR bool isinf(half arg) { return (arg.data_ & 0x7FFF) == 0x7C00; } + +/// Check for NaN. +/// **See also:** Documentation for +/// [std::isnan](https://en.cppreference.com/w/cpp/numeric/math/isnan). +/// \param arg number to check +/// \retval true for NaNs +/// \retval false else +inline HALF_CONSTEXPR bool isnan(half arg) { return (arg.data_ & 0x7FFF) > 0x7C00; } + +/// Check if normal number. +/// **See also:** Documentation for +/// [std::isnormal](https://en.cppreference.com/w/cpp/numeric/math/isnormal). +/// \param arg number to check +/// \retval true if normal number +/// \retval false if either subnormal, zero, infinity or NaN +inline HALF_CONSTEXPR bool isnormal(half arg) +{ + return ((arg.data_ & 0x7C00) != 0) & ((arg.data_ & 0x7C00) != 0x7C00); +} + +/// Check sign. +/// **See also:** Documentation for +/// [std::signbit](https://en.cppreference.com/w/cpp/numeric/math/signbit). +/// \param arg number to check +/// \retval true for negative number +/// \retval false for positive number +inline HALF_CONSTEXPR bool signbit(half arg) { return (arg.data_ & 0x8000) != 0; } + +/// \} +/// \anchor compfunc +/// \name Comparison +/// \{ + +/// Quiet comparison for greater than. +/// **See also:** Documentation for +/// [std::isgreater](https://en.cppreference.com/w/cpp/numeric/math/isgreater). +/// \param x first operand +/// \param y second operand +/// \retval true if \a x greater than \a y +/// \retval false else +inline HALF_CONSTEXPR bool isgreater(half x, half y) +{ + return ((x.data_ ^ (0x8000 | (0x8000 - (x.data_ >> 15)))) + (x.data_ >> 15)) > + ((y.data_ ^ (0x8000 | (0x8000 - (y.data_ >> 15)))) + (y.data_ >> 15)) && + !isnan(x) && !isnan(y); +} + +/// Quiet comparison for greater equal. 
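The classification helpers above mirror their <cmath> counterparts but inspect the half bit pattern directly. A short sketch (same assumed "half.hpp" include) over a zero, a normal value, a value that is subnormal in half precision, and one that overflows to infinity:

    #include <cstdio>
    #include "half.hpp" // assumed include path

    int main()
    {
        using half_float::half;
        half values[] = {half(0.0f), half(1.5f),
                         half(1e-6f),     // subnormal in half precision
                         half(65536.0f)}; // above the half range, becomes +inf
        for(half v : values)
            std::printf("%-12g finite=%d normal=%d inf=%d nan=%d\n",
                        static_cast<float>(v),
                        half_float::isfinite(v), half_float::isnormal(v),
                        half_float::isinf(v), half_float::isnan(v));
        return 0;
    }
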
+/// **See also:** Documentation for +/// [std::isgreaterequal](https://en.cppreference.com/w/cpp/numeric/math/isgreaterequal). +/// \param x first operand +/// \param y second operand +/// \retval true if \a x greater equal \a y +/// \retval false else +inline HALF_CONSTEXPR bool isgreaterequal(half x, half y) +{ + return ((x.data_ ^ (0x8000 | (0x8000 - (x.data_ >> 15)))) + (x.data_ >> 15)) >= + ((y.data_ ^ (0x8000 | (0x8000 - (y.data_ >> 15)))) + (y.data_ >> 15)) && + !isnan(x) && !isnan(y); +} + +/// Quiet comparison for less than. +/// **See also:** Documentation for +/// [std::isless](https://en.cppreference.com/w/cpp/numeric/math/isless). +/// \param x first operand +/// \param y second operand +/// \retval true if \a x less than \a y +/// \retval false else +inline HALF_CONSTEXPR bool isless(half x, half y) +{ + return ((x.data_ ^ (0x8000 | (0x8000 - (x.data_ >> 15)))) + (x.data_ >> 15)) < + ((y.data_ ^ (0x8000 | (0x8000 - (y.data_ >> 15)))) + (y.data_ >> 15)) && + !isnan(x) && !isnan(y); +} + +/// Quiet comparison for less equal. +/// **See also:** Documentation for +/// [std::islessequal](https://en.cppreference.com/w/cpp/numeric/math/islessequal). +/// \param x first operand +/// \param y second operand +/// \retval true if \a x less equal \a y +/// \retval false else +inline HALF_CONSTEXPR bool islessequal(half x, half y) +{ + return ((x.data_ ^ (0x8000 | (0x8000 - (x.data_ >> 15)))) + (x.data_ >> 15)) <= + ((y.data_ ^ (0x8000 | (0x8000 - (y.data_ >> 15)))) + (y.data_ >> 15)) && + !isnan(x) && !isnan(y); +} + +/// Quiet comarison for less or greater. +/// **See also:** Documentation for +/// [std::islessgreater](https://en.cppreference.com/w/cpp/numeric/math/islessgreater). +/// \param x first operand +/// \param y second operand +/// \retval true if either less or greater +/// \retval false else +inline HALF_CONSTEXPR bool islessgreater(half x, half y) +{ + return x.data_ != y.data_ && ((x.data_ | y.data_) & 0x7FFF) && !isnan(x) && !isnan(y); +} + +/// Quiet check if unordered. +/// **See also:** Documentation for +/// [std::isunordered](https://en.cppreference.com/w/cpp/numeric/math/isunordered). +/// \param x first operand +/// \param y second operand +/// \retval true if unordered (one or two NaN operands) +/// \retval false else +inline HALF_CONSTEXPR bool isunordered(half x, half y) { return isnan(x) || isnan(y); } + +/// \} +/// \anchor casting +/// \name Casting +/// \{ + +/// Cast to or from half-precision floating-point number. +/// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values +/// are converted +/// directly using the default rounding mode, without any roundtrip over `float` that a +/// `static_cast` would otherwise do. +/// +/// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any +/// of the two types +/// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) +/// results in a compiler +/// error and casting between [half](\ref half_float::half)s returns the argument unmodified. 
+/// \tparam T destination type (half or built-in arithmetic type) +/// \tparam U source type (half or built-in arithmetic type) +/// \param arg value to cast +/// \return \a arg converted to destination type +/// \exception FE_INVALID if \a T is integer type and result is not representable as \a T +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +template +T half_cast(U arg) +{ + return detail::half_caster::cast(arg); +} + +/// Cast to or from half-precision floating-point number. +/// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values +/// are converted +/// directly using the specified rounding mode, without any roundtrip over `float` that a +/// `static_cast` would otherwise do. +/// +/// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any +/// of the two types +/// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) +/// results in a compiler +/// error and casting between [half](\ref half_float::half)s returns the argument unmodified. +/// \tparam T destination type (half or built-in arithmetic type) +/// \tparam R rounding mode to use. +/// \tparam U source type (half or built-in arithmetic type) +/// \param arg value to cast +/// \return \a arg converted to destination type +/// \exception FE_INVALID if \a T is integer type and result is not representable as \a T +/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding +template +T half_cast(U arg) +{ + return detail::half_caster::cast(arg); +} +/// \} + +/// \} +/// \anchor errors +/// \name Error handling +/// \{ + +/// Clear exception flags. +/// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is +/// disabled, +/// but in that case manual flag management is the only way to raise flags. +/// +/// **See also:** Documentation for +/// [std::feclearexcept](https://en.cppreference.com/w/cpp/numeric/fenv/feclearexcept). +/// \param excepts OR of exceptions to clear +/// \retval 0 all selected flags cleared successfully +inline int feclearexcept(int excepts) +{ + detail::errflags() &= ~excepts; + return 0; +} + +/// Test exception flags. +/// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is +/// disabled, +/// but in that case manual flag management is the only way to raise flags. +/// +/// **See also:** Documentation for +/// [std::fetestexcept](https://en.cppreference.com/w/cpp/numeric/fenv/fetestexcept). +/// \param excepts OR of exceptions to test +/// \return OR of selected exceptions if raised +inline int fetestexcept(int excepts) { return detail::errflags() & excepts; } + +/// Raise exception flags. +/// This raises the specified floating point exceptions and also invokes any additional automatic +/// exception handling as +/// configured with the [HALF_ERRHANDLIG_...](\ref HALF_ERRHANDLING_ERRNO) preprocessor symbols. +/// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is +/// disabled, +/// but in that case manual flag management is the only way to raise flags. +/// +/// **See also:** Documentation for +/// [std::feraiseexcept](https://en.cppreference.com/w/cpp/numeric/fenv/feraiseexcept). 
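half_cast converts to and from built-in types directly, without the float round trip a static_cast would take, and integer targets report unrepresentable inputs through the FE_ flags handled by the functions in this section. A sketch of both, assuming the "half.hpp" include path; note that the flag test only fires when the header is compiled with HALF_ERRHANDLING_FLAGS enabled:

    #include <cfenv>
    #include <cstdio>
    #include <limits>
    #include "half.hpp" // assumed include path

    int main()
    {
        using half_float::half;
        // double -> half directly, with an explicitly requested rounding mode
        half a = half_float::half_cast<half, std::round_toward_zero>(1.0 / 3.0);
        int  b = half_float::half_cast<int>(half(3.75f)); // rounds with half::round_style

        half inf_val(100000.0f); // overflows the half range, becomes +inf
        half_float::feclearexcept(FE_ALL_EXCEPT);
        (void)half_float::half_cast<int>(inf_val); // +inf is not representable as int
        if(half_float::fetestexcept(FE_INVALID))   // needs HALF_ERRHANDLING_FLAGS
            std::printf("FE_INVALID raised\n");

        std::printf("a = %f, b = %d\n", static_cast<float>(a), b);
        return 0;
    }
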
+/// \param excepts OR of exceptions to raise +/// \retval 0 all selected exceptions raised successfully +inline int feraiseexcept(int excepts) +{ + detail::errflags() |= excepts; + detail::raise(excepts); + return 0; +} + +/// Save exception flags. +/// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is +/// disabled, +/// but in that case manual flag management is the only way to raise flags. +/// +/// **See also:** Documentation for +/// [std::fegetexceptflag](https://en.cppreference.com/w/cpp/numeric/fenv/feexceptflag). +/// \param flagp adress to store flag state at +/// \param excepts OR of flags to save +/// \retval 0 for success +inline int fegetexceptflag(int* flagp, int excepts) +{ + *flagp = detail::errflags() & excepts; + return 0; +} + +/// Restore exception flags. +/// This only copies the specified exception state (including unset flags) without incurring any +/// additional exception handling. +/// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is +/// disabled, +/// but in that case manual flag management is the only way to raise flags. +/// +/// **See also:** Documentation for +/// [std::fesetexceptflag](https://en.cppreference.com/w/cpp/numeric/fenv/feexceptflag). +/// \param flagp adress to take flag state from +/// \param excepts OR of flags to restore +/// \retval 0 for success +inline int fesetexceptflag(const int* flagp, int excepts) +{ + detail::errflags() = (detail::errflags() | (*flagp & excepts)) & (*flagp | ~excepts); + return 0; +} + +/// Throw C++ exceptions based on set exception flags. +/// This function manually throws a corresponding C++ exception if one of the specified flags is +/// set, +/// no matter if automatic throwing (via [HALF_ERRHANDLING_THROW_...](\ref +/// HALF_ERRHANDLING_THROW_INVALID)) is enabled or not. +/// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is +/// disabled, +/// but in that case manual flag management is the only way to raise flags. +/// \param excepts OR of exceptions to test +/// \param msg error message to use for exception description +/// \throw std::domain_error if `FE_INVALID` or `FE_DIVBYZERO` is selected and set +/// \throw std::overflow_error if `FE_OVERFLOW` is selected and set +/// \throw std::underflow_error if `FE_UNDERFLOW` is selected and set +/// \throw std::range_error if `FE_INEXACT` is selected and set +inline void fethrowexcept(int excepts, const char* msg = "") +{ + excepts &= detail::errflags(); + if(excepts & (FE_INVALID | FE_DIVBYZERO)) + throw std::domain_error(msg); + if(excepts & FE_OVERFLOW) + throw std::overflow_error(msg); + if(excepts & FE_UNDERFLOW) + throw std::underflow_error(msg); + if(excepts & FE_INEXACT) + throw std::range_error(msg); +} +/// \} +} // namespace half_float + +#undef HALF_UNUSED_NOERR +#undef HALF_CONSTEXPR +#undef HALF_CONSTEXPR_CONST +#undef HALF_CONSTEXPR_NOERR +#undef HALF_NOEXCEPT +#undef HALF_NOTHROW +#undef HALF_THREAD_LOCAL +#undef HALF_TWOS_COMPLEMENT_INT +#ifdef HALF_POP_WARNINGS +#pragma warning(pop) +#undef HALF_POP_WARNINGS +#endif + +#endif diff --git a/external/rocm/include/bfloat16_dev.hpp b/external/rocm/include/bfloat16_dev.hpp new file mode 100644 index 0000000000..52d00346cf --- /dev/null +++ b/external/rocm/include/bfloat16_dev.hpp @@ -0,0 +1,125 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2019 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef BFLOAT16_DEVICE_HPP +#define BFLOAT16_DEVICE_HPP + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __HIP_PLATFORM_HCC__ +#define EXECUTION_SPECIFIER __device__ +#else +#define EXECUTION_SPECIFIER +#endif // MIOPEN_BACKEND_HIP + +typedef union +{ + uint u32; + ushort2 ushortx2; + +// Composable kernels are written in HIP language. The language doesnt support +// ushort2.hi or ushort2.low. +#ifdef __HIP_PLATFORM_HCC__ + ushort ushortvec[2]; +#endif // MIOPEN_BACKEND_HIP + float f32; +} cvt_bf16_fp32_t; + +EXECUTION_SPECIFIER float bfloat16_to_float(ushort src_val) +{ + cvt_bf16_fp32_t target_val; + +#ifdef __HIP_PLATFORM_HCC__ + target_val.ushortx2 = make_ushort2(0, src_val); +#else + target_val.ushortx2 = (ushort2)(0, src_val); +#endif + + return target_val.f32; +} + +EXECUTION_SPECIFIER ushort float_to_bfloat16(float src_val) +{ + cvt_bf16_fp32_t target_val; + target_val.f32 = src_val; + // BF16 round and NaN preservation code matches + // https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/library/include/rocblas_bfloat16.h + if((~target_val.u32 & 0x7f800000) == 0) // Inf or NaN + { + // When all of the exponent bits are 1, the value is Inf or NaN. + // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero + // mantissa bit. Quiet NaN is indicated by the most significant mantissa + // bit being 1. Signaling NaN is indicated by the most significant + // mantissa bit being 0 but some other bit(s) being 1. If any of the + // lower 16 bits of the mantissa are 1, we set the least significant bit + // of the bfloat16 mantissa, in order to preserve signaling NaN in case + // the bloat16's mantissa bits are all 0. + if((target_val.u32 & 0xffff) != 0) + { + target_val.u32 |= 0x10000; // Preserve signaling NaN + } + } + else + { +#ifdef MIOPEN_USE_RNE_BFLOAT16 +// When the exponent bits are not all 1s, then the value is zero, normal, +// or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus +// 1 if the least significant bit of the bfloat16 mantissa is 1 (odd). +// This causes the bfloat16's mantissa to be incremented by 1 if the 16 +// least significant bits of the float mantissa are greater than 0x8000, +// or if they are equal to 0x8000 and the least significant bit of the +// bfloat16 mantissa is 1 (odd). 
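+// Worked example of the rounding step below (illustrative values added for
+// clarity, not from the original source): for f32 bits 0x3F808000 (1.00390625)
+// the low 16 bits are exactly 0x8000 and the bf16 lsb (bit 16) is 0, so adding
+// 0x7FFF gives 0x3F80FFFF and the upper halfword stays 0x3F80, i.e. the tie
+// rounds down to the even value 1.0. For 0x3F818000 the bf16 lsb is 1, so
+// adding 0x8000 gives 0x3F820000 and the tie rounds up to the even value 0x3F82.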
This causes it to be rounded to even when +// the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already +// has the value 0x7f, then incrementing it causes it to become 0x00 and +// the exponent is incremented by one, which is the next higher FP value +// to the unrounded bfloat16 value. When the bfloat16 value is subnormal +// with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up +// to a normal value with an exponent of 0x01 and a mantissa of 0x00. +// When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F, +// incrementing it causes it to become an exponent of 0xFF and a mantissa +// of 0x00, which is Inf, the next higher value to the unrounded value. +#ifdef __HIP_PLATFORM_HCC__ + target_val.u32 += (0x7fff + (target_val.ushortvec[1] & 1)); +#else + target_val.u32 += + (0x7fff + (target_val.ushortx2.hi & 1)); // Round to nearest, round to even +#endif // MIOPEN_BACKEND_HIP +#endif // MIOPEN_USE_RNE_BFLOAT16 + } + +#ifdef __HIP_PLATFORM_HCC__ + return target_val.ushortvec[1]; +#else + return target_val.ushortx2.hi; +#endif // MIOPEN_BACKEND_HIP +} + +#ifdef __cplusplus +} +#endif + +#endif // BFLOAT16_DEVICE_HPP diff --git a/host/CMakeLists.txt b/host/CMakeLists.txt new file mode 100644 index 0000000000..c9779398a6 --- /dev/null +++ b/host/CMakeLists.txt @@ -0,0 +1,4 @@ +add_subdirectory(host_tensor) +add_subdirectory(online_compilation) +add_subdirectory(driver_offline) +add_subdirectory(driver_online) diff --git a/host/driver_offline/CMakeLists.txt b/host/driver_offline/CMakeLists.txt new file mode 100644 index 0000000000..85bd31fbca --- /dev/null +++ b/host/driver_offline/CMakeLists.txt @@ -0,0 +1,21 @@ +include_directories(BEFORE + include + ${PROJECT_SOURCE_DIR}/host/host_tensor/include + ${PROJECT_SOURCE_DIR}/composable_kernel/include + ${PROJECT_SOURCE_DIR}/composable_kernel/include/utility + ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description + ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation + ${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform + ${PROJECT_SOURCE_DIR}/composable_kernel/include/driver + ${PROJECT_SOURCE_DIR}/external/rocm/include + ${PROJECT_SOURCE_DIR}/external/half/include +) + +set(CONV_FWD_DRIVER_OFFLINE_SOURCE conv_fwd_driver_offline.cpp) +set(CONV_BWD_DRIVER_OFFLINE_SOURCE conv_bwd_driver_offline.cpp) + +add_executable(conv_fwd_driver_offline ${CONV_FWD_DRIVER_OFFLINE_SOURCE}) +add_executable(conv_bwd_driver_offline ${CONV_BWD_DRIVER_OFFLINE_SOURCE}) + +target_link_libraries(conv_fwd_driver_offline PRIVATE host_tensor) +target_link_libraries(conv_bwd_driver_offline PRIVATE host_tensor) diff --git a/host/driver_offline/conv_bwd_driver_offline.cpp b/host/driver_offline/conv_bwd_driver_offline.cpp new file mode 100644 index 0000000000..61c3fc385d --- /dev/null +++ b/host/driver_offline/conv_bwd_driver_offline.cpp @@ -0,0 +1,357 @@ +#include +#include +#include +#include +#include +#include +#include "config.hpp" +#include "print.hpp" +#include "device.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" +#include "conv_common.hpp" +#include "host_conv_bwd_data.hpp" +#include "device_tensor.hpp" +#include "device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp" +#include "device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp" + +#define USE_DYNAMIC_MODE 1 +#define USE_CONV_BWD_V4R1_XDL_NHWC 1 +#define USE_CONV_BWD_V4R1R2_XDL_NHWC 1 + +enum ConvBackwardDataAlgo +{ + V4R1XDLNHWC, + 
V4R1R2XDLNHWC, +}; + +int main(int argc, char* argv[]) +{ + using namespace ck; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + constexpr auto I5 = Number<5>{}; + constexpr auto I6 = Number<6>{}; + +#if USE_DYNAMIC_MODE + // dynamic mode + if(argc != 22) + { + printf("arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat\n"); + printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n"); + exit(1); + } + + const ConvTensorLayout layout = static_cast(atoi(argv[1])); + const ConvBackwardDataAlgo algo = static_cast(atoi(argv[2])); + const bool do_verification = atoi(argv[3]); + const int init_method = atoi(argv[4]); + const bool do_log = atoi(argv[5]); + const int nrepeat = atoi(argv[6]); + + const index_t N = atoi(argv[7]); + const index_t K = atoi(argv[8]); + const index_t C = atoi(argv[9]); + const index_t Y = atoi(argv[10]); + const index_t X = atoi(argv[11]); + const index_t Hi = atoi(argv[12]); + const index_t Wi = atoi(argv[13]); + + const index_t conv_stride_h = atoi(argv[14]); + const index_t conv_stride_w = atoi(argv[15]); + const index_t conv_dilation_h = atoi(argv[16]); + const index_t conv_dilation_w = atoi(argv[17]); + const index_t in_left_pad_h = atoi(argv[18]); + const index_t in_left_pad_w = atoi(argv[19]); + const index_t in_right_pad_h = atoi(argv[20]); + const index_t in_right_pad_w = atoi(argv[21]); + + const index_t YEff = (Y - 1) * conv_dilation_h + 1; + const index_t XEff = (X - 1) * conv_dilation_w + 1; + + const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1; + const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; +#else + // static mode + if(argc < 7) + { + printf("arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat\n"); + exit(1); + } + + const ConvTensorLayout layout = static_cast(atoi(argv[1])); + const ConvBackwardDataAlgo algo = static_cast(atoi(argv[2])); + const bool do_verification = atoi(argv[3]); + const int init_method = atoi(argv[4]); + const bool do_log = atoi(argv[5]); + const int nrepeat = atoi(argv[6]); + + constexpr index_t N = 128; + constexpr index_t C = 192; + constexpr index_t Hi = 71; + constexpr index_t Wi = 71; + constexpr index_t K = 256; + constexpr index_t Y = 3; + constexpr index_t X = 3; + + const index_t conv_stride_h = 2; + const index_t conv_stride_w = 2; + const index_t conv_dilation_h = 1; + const index_t conv_dilation_w = 1; + const index_t in_left_pad_h = 1; + const index_t in_left_pad_w = 1; + const index_t in_right_pad_h = 1; + const index_t in_right_pad_w = 1; + + const index_t YEff = (Y - 1) * conv_dilation_h + 1; + const index_t XEff = (X - 1) * conv_dilation_w + 1; + + const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1; + const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; +#endif + +#if 0 + constexpr index_t in_vector_size = 1; + using in_data_t = float; + using acc_data_t = float; + using out_data_t = float; +#elif 1 + constexpr index_t in_vector_size = 1; + using in_data_t = half_t; + using acc_data_t = float; + using out_data_t = half_t; +#endif + + std::vector in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4); + + switch(layout) + { + case ConvTensorLayout::NCHW: + // NCHW + in_lengths_host[0] = static_cast(N); + in_lengths_host[1] = static_cast(C); + in_lengths_host[2] = static_cast(Hi); 
+ in_lengths_host[3] = static_cast(Wi); + wei_lengths_host[0] = static_cast(K); + wei_lengths_host[1] = static_cast(C); + wei_lengths_host[2] = static_cast(Y); + wei_lengths_host[3] = static_cast(X); + out_lengths_host[0] = static_cast(N); + out_lengths_host[1] = static_cast(K); + out_lengths_host[2] = static_cast(Ho); + out_lengths_host[3] = static_cast(Wo); + break; + case ConvTensorLayout::NHWC: + // NHWC + in_lengths_host[0] = static_cast(N); + in_lengths_host[1] = static_cast(Hi); + in_lengths_host[2] = static_cast(Wi); + in_lengths_host[3] = static_cast(C); + wei_lengths_host[0] = static_cast(K); + wei_lengths_host[1] = static_cast(Y); + wei_lengths_host[2] = static_cast(X); + wei_lengths_host[3] = static_cast(C); + out_lengths_host[0] = static_cast(N); + out_lengths_host[1] = static_cast(Ho); + out_lengths_host[2] = static_cast(Wo); + out_lengths_host[3] = static_cast(K); + break; + default: throw std::runtime_error("wrong! not implemented"); + } + + Tensor in_host(in_lengths_host); + Tensor in_device(in_lengths_host); + Tensor wei(wei_lengths_host); + Tensor out(out_lengths_host); + + std::cout << "layout: " << layout << std::endl; + ostream_HostTensorDescriptor(in_host.mDesc, std::cout << "in: "); + ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: "); + ostream_HostTensorDescriptor(out.mDesc, std::cout << "out: "); + print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w)); + print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w)); + print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w)); + print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w)); + + std::size_t num_thread = std::thread::hardware_concurrency(); + + switch(init_method) + { + case 0: + // no initialization + break; + case 1: + out.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + break; + case 2: + out.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + case 3: + out.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + break; + case 4: + out.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + case 5: + out.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}, num_thread); + break; + default: + out.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); + + auto gen_wei = [](auto... is) { + return GeneratorTensor_2{1, 5}(is...) 
* GeneratorTensor_Checkboard{}(is...); + }; + wei.GenerateTensorValue(gen_wei, num_thread); + } + + auto f_make_for_device_nchw = [&]() { +#if USE_DYNAMIC_MODE + const auto in_lengths_dev = make_tuple(N, C, Hi, Wi); + const auto wei_lengths_dev = make_tuple(K, C, Y, X); + const auto out_lengths_dev = make_tuple(N, K, Ho, Wo); + const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w); + const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w); + const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w); + const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w); +#else + const auto in_lengths_dev = + make_tuple(Number{}, Number{}, Number{}, Number{}); + const auto wei_lengths_dev = make_tuple(Number{}, Number{}, Number{}, Number{}); + const auto out_lengths_dev = + make_tuple(Number{}, Number{}, Number{}, Number{}); + const auto conv_strides_dev = make_tuple(Number{}, Number{}); + const auto conv_dilations_dev = + make_tuple(Number{}, Number{}); + const auto in_left_pads_dev = make_tuple(Number{}, Number{}); + const auto in_right_pads_dev = + make_tuple(Number{}, Number{}); +#endif + + return make_tuple(in_lengths_dev, + wei_lengths_dev, + out_lengths_dev, + conv_strides_dev, + conv_dilations_dev, + in_left_pads_dev, + in_right_pads_dev); + }; + + auto f_make_for_device_nhwc = [&]() { +#if USE_DYNAMIC_MODE + const auto in_lengths_dev = make_tuple(N, Hi, Wi, C); + const auto wei_lengths_dev = make_tuple(K, Y, X, C); + const auto out_lengths_dev = make_tuple(N, Ho, Wo, K); + const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w); + const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w); + const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w); + const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w); +#else + const auto in_lengths_dev = + make_tuple(Number{}, Number{}, Number{}, Number{}); + const auto wei_lengths_dev = make_tuple(Number{}, Number{}, Number{}, Number{}); + const auto out_lengths_dev = + make_tuple(Number{}, Number{}, Number{}, Number{}); + const auto conv_strides_dev = make_tuple(Number{}, Number{}); + const auto conv_dilations_dev = + make_tuple(Number{}, Number{}); + const auto in_left_pads_dev = make_tuple(Number{}, Number{}); + const auto in_right_pads_dev = + make_tuple(Number{}, Number{}); +#endif + + return make_tuple(in_lengths_dev, + wei_lengths_dev, + out_lengths_dev, + conv_strides_dev, + conv_dilations_dev, + in_left_pads_dev, + in_right_pads_dev); + }; + + const auto nhwc_desc = f_make_for_device_nhwc(); + +#if USE_CONV_BWD_V4R1_XDL_NHWC + if(algo == ConvBackwardDataAlgo::V4R1XDLNHWC) + { + if(layout != ConvTensorLayout::NHWC) + { + throw std::runtime_error("wrong! layout"); + } + + const auto tmp = f_make_for_device_nhwc(); + + device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk< + in_data_t, + acc_data_t, + out_data_t>(tmp[I0], + tmp[I1], + tmp[I2], + tmp[I3], + tmp[I4], + tmp[I5], + tmp[I6], + in_device, + wei, + out, + nrepeat); + } +#endif + +#if USE_CONV_BWD_V4R1R2_XDL_NHWC + if(algo == ConvBackwardDataAlgo::V4R1R2XDLNHWC) + { + if(layout != ConvTensorLayout::NHWC) + { + throw std::runtime_error("wrong! 
layout"); + } + + const auto tmp = f_make_for_device_nhwc(); + + device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk< + in_data_t, + acc_data_t, + out_data_t>(tmp[I0], + tmp[I1], + tmp[I2], + tmp[I3], + tmp[I4], + tmp[I5], + tmp[I6], + in_device, + wei, + out, + nrepeat); + } +#endif + + if(do_verification) + { + host_direct_convolution_backward_data(in_host, + wei, + out, + make_tuple(conv_stride_h, conv_stride_w), + make_tuple(conv_dilation_h, conv_dilation_w), + make_tuple(in_left_pad_h, in_left_pad_w), + make_tuple(in_right_pad_h, in_right_pad_w), + layout); + + check_error(in_host, in_device); + + if(do_log) + { + LogRangeAsType(std::cout << "out : ", out.mData, ",") << std::endl; + LogRangeAsType(std::cout << "wei: ", wei.mData, ",") << std::endl; + LogRangeAsType(std::cout << "in_host : ", in_host.mData, ",") << std::endl; + LogRangeAsType(std::cout << "in_device: ", in_device.mData, ",") << std::endl; + } + } +} diff --git a/host/driver_offline/conv_fwd_driver_offline.cpp b/host/driver_offline/conv_fwd_driver_offline.cpp new file mode 100644 index 0000000000..ef2e16c4fa --- /dev/null +++ b/host/driver_offline/conv_fwd_driver_offline.cpp @@ -0,0 +1,480 @@ +#include +#include +#include +#include +#include +#include +#include "config.hpp" +#include "print.hpp" +#include "device.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" +#include "conv_common.hpp" +#include "host_conv.hpp" +#include "device_tensor.hpp" +#include "device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp" +#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp" +#include "device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp" +#include "device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp" +#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp" +#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp" + +#define USE_DYNAMIC_MODE 1 +#define USE_CONV_FWD_V4R4_NCHW 1 +#define USE_CONV_FWD_V4R4R2_NHWC 1 +#define USE_CONV_FWD_V6R1_NCHW 1 +#define USE_CONV_FWD_V5R1_NCHW 0 +#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0 +#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0 + +enum ConvForwardAlgo +{ + V4R4NCHW, // 0 + V4R4R2NHWC, // 1 + V6R1NCHW, // 2 + V5R1NCHW, // 3 + V4R4R2XDLNCHW, // 4 + V4R4R4XDLNHWC // 5 +}; + +int main(int argc, char* argv[]) +{ + using namespace ck; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + constexpr auto I5 = Number<5>{}; + constexpr auto I6 = Number<6>{}; + +#if USE_DYNAMIC_MODE + // dynamic mode + if(argc != 22) + { + printf("arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat\n"); + printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n"); + exit(1); + } + + const ConvTensorLayout layout = static_cast(atoi(argv[1])); + const ConvForwardAlgo algo = static_cast(atoi(argv[2])); + const bool do_verification = atoi(argv[3]); + const int init_method = atoi(argv[4]); + const bool do_log = atoi(argv[5]); + const int nrepeat = atoi(argv[6]); + + const index_t N = atoi(argv[7]); + const index_t K = atoi(argv[8]); + const index_t C = atoi(argv[9]); + const index_t Y = atoi(argv[10]); + const index_t X = atoi(argv[11]); + const index_t Hi = atoi(argv[12]); + const index_t Wi = atoi(argv[13]); + + const 
index_t conv_stride_h = atoi(argv[14]); + const index_t conv_stride_w = atoi(argv[15]); + const index_t conv_dilation_h = atoi(argv[16]); + const index_t conv_dilation_w = atoi(argv[17]); + const index_t in_left_pad_h = atoi(argv[18]); + const index_t in_left_pad_w = atoi(argv[19]); + const index_t in_right_pad_h = atoi(argv[20]); + const index_t in_right_pad_w = atoi(argv[21]); + + const index_t YEff = (Y - 1) * conv_dilation_h + 1; + const index_t XEff = (X - 1) * conv_dilation_w + 1; + + const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1; + const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; +#else + // static mode + if(argc < 7) + { + printf("arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat\n"); + exit(1); + } + + const ConvTensorLayout layout = static_cast(atoi(argv[1])); + const ConvForwardAlgo algo = static_cast(atoi(argv[2])); + const bool do_verification = atoi(argv[3]); + const int init_method = atoi(argv[4]); + const bool do_log = atoi(argv[5]); + const int nrepeat = atoi(argv[6]); + + constexpr index_t N = 128; + constexpr index_t C = 192; + constexpr index_t Hi = 71; + constexpr index_t Wi = 71; + constexpr index_t K = 256; + constexpr index_t Y = 3; + constexpr index_t X = 3; + + const index_t conv_stride_h = 2; + const index_t conv_stride_w = 2; + const index_t conv_dilation_h = 1; + const index_t conv_dilation_w = 1; + const index_t in_left_pad_h = 1; + const index_t in_left_pad_w = 1; + const index_t in_right_pad_h = 1; + const index_t in_right_pad_w = 1; + + const index_t YEff = (Y - 1) * conv_dilation_h + 1; + const index_t XEff = (X - 1) * conv_dilation_w + 1; + + const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1; + const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; +#endif + +#if 1 + using in_data_t = float; + using acc_data_t = float; + using out_data_t = float; +#elif 1 + using in_data_t = half_t; + using acc_data_t = float; + using out_data_t = half_t; +#elif 1 + using in_data_t = int8_t; + using acc_data_t = int32_t; + using out_data_t = int8_t; +#endif + + std::vector in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4); + + switch(layout) + { + case ConvTensorLayout::NCHW: + // NCHW + in_lengths_host[0] = static_cast(N); + in_lengths_host[1] = static_cast(C); + in_lengths_host[2] = static_cast(Hi); + in_lengths_host[3] = static_cast(Wi); + wei_lengths_host[0] = static_cast(K); + wei_lengths_host[1] = static_cast(C); + wei_lengths_host[2] = static_cast(Y); + wei_lengths_host[3] = static_cast(X); + out_lengths_host[0] = static_cast(N); + out_lengths_host[1] = static_cast(K); + out_lengths_host[2] = static_cast(Ho); + out_lengths_host[3] = static_cast(Wo); + break; + case ConvTensorLayout::NHWC: + // NHWC + in_lengths_host[0] = static_cast(N); + in_lengths_host[1] = static_cast(Hi); + in_lengths_host[2] = static_cast(Wi); + in_lengths_host[3] = static_cast(C); + wei_lengths_host[0] = static_cast(K); + wei_lengths_host[1] = static_cast(Y); + wei_lengths_host[2] = static_cast(X); + wei_lengths_host[3] = static_cast(C); + out_lengths_host[0] = static_cast(N); + out_lengths_host[1] = static_cast(Ho); + out_lengths_host[2] = static_cast(Wo); + out_lengths_host[3] = static_cast(K); + break; + default: throw std::runtime_error("wrong! 
not implemented"); + } + + Tensor in(in_lengths_host); + Tensor wei(wei_lengths_host); + Tensor out_host(out_lengths_host); + Tensor out_device(out_lengths_host); + + std::cout << "layout: " << layout << std::endl; + ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: "); + ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: "); + ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: "); + print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w)); + print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w)); + print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w)); + print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w)); + + std::size_t num_thread = std::thread::hardware_concurrency(); + + switch(init_method) + { + case 0: + // no initialization + break; + case 1: + in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + break; + case 2: + in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + case 3: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + break; + case 4: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + case 5: + in.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}, num_thread); + break; + default: + in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); + + auto gen_wei = [](auto... is) { + return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...); + }; + wei.GenerateTensorValue(gen_wei, num_thread); + } + + auto f_make_for_device_nchw = [&]() { +#if USE_DYNAMIC_MODE + const auto in_lengths_dev = make_tuple(N, C, Hi, Wi); + const auto wei_lengths_dev = make_tuple(K, C, Y, X); + const auto out_lengths_dev = make_tuple(N, K, Ho, Wo); + const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w); + const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w); + const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w); + const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w); +#else + const auto in_lengths_dev = + make_tuple(Number{}, Number{}, Number{}, Number{}); + const auto wei_lengths_dev = make_tuple(Number{}, Number{}, Number{}, Number{}); + const auto out_lengths_dev = + make_tuple(Number{}, Number{}, Number{}, Number{}); + const auto conv_strides_dev = make_tuple(Number{}, Number{}); + const auto conv_dilations_dev = + make_tuple(Number{}, Number{}); + const auto in_left_pads_dev = make_tuple(Number{}, Number{}); + const auto in_right_pads_dev = + make_tuple(Number{}, Number{}); +#endif + + return make_tuple(in_lengths_dev, + wei_lengths_dev, + out_lengths_dev, + conv_strides_dev, + conv_dilations_dev, + in_left_pads_dev, + in_right_pads_dev); + }; + + auto f_make_for_device_nhwc = [&]() { +#if USE_DYNAMIC_MODE + const auto in_lengths_dev = make_tuple(N, Hi, Wi, C); + const auto wei_lengths_dev = make_tuple(K, Y, X, C); + const auto out_lengths_dev = make_tuple(N, Ho, Wo, K); + const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w); + const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w); + const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w); + const auto in_right_pads_dev = 
make_tuple(in_right_pad_h, in_right_pad_w); +#else + const auto in_lengths_dev = + make_tuple(Number{}, Number{}, Number{}, Number{}); + const auto wei_lengths_dev = make_tuple(Number{}, Number{}, Number{}, Number{}); + const auto out_lengths_dev = + make_tuple(Number{}, Number{}, Number{}, Number{}); + const auto conv_strides_dev = make_tuple(Number{}, Number{}); + const auto conv_dilations_dev = + make_tuple(Number{}, Number{}); + const auto in_left_pads_dev = make_tuple(Number{}, Number{}); + const auto in_right_pads_dev = + make_tuple(Number{}, Number{}); +#endif + + return make_tuple(in_lengths_dev, + wei_lengths_dev, + out_lengths_dev, + conv_strides_dev, + conv_dilations_dev, + in_left_pads_dev, + in_right_pads_dev); + }; + +#if USE_CONV_FWD_V4R4_NCHW + if(algo == ConvForwardAlgo::V4R4NCHW) + { + if(layout != ConvTensorLayout::NCHW) + { + throw std::runtime_error("wrong! layout"); + } + + const auto tmp = f_make_for_device_nchw(); + + device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw( + tmp[I0], + tmp[I1], + tmp[I2], + tmp[I3], + tmp[I4], + tmp[I5], + tmp[I6], + in, + wei, + out_device, + nrepeat); + } +#endif + +#if USE_CONV_FWD_V4R4R2_NHWC + if(algo == ConvForwardAlgo::V4R4R2NHWC) + { + if(layout != ConvTensorLayout::NHWC) + { + throw std::runtime_error("wrong! layout"); + } + + const auto tmp = f_make_for_device_nhwc(); + + device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk( + tmp[I0], + tmp[I1], + tmp[I2], + tmp[I3], + tmp[I4], + tmp[I5], + tmp[I6], + in, + wei, + out_device, + nrepeat); + } +#endif + +#if USE_CONV_FWD_V6R1_NCHW + if(algo == ConvForwardAlgo::V6R1NCHW) + { + if(layout != ConvTensorLayout::NCHW) + { + throw std::runtime_error("wrong! layout"); + } + + const auto tmp = f_make_for_device_nchw(); + + device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw( + tmp[I0], + tmp[I1], + tmp[I2], + tmp[I3], + tmp[I4], + tmp[I5], + tmp[I6], + in, + wei, + out_device, + nrepeat); + } +#endif + +#if USE_CONV_FWD_V5R1_NCHW + if(algo == ConvForwardAlgo::V5R1NCHW) + { + if(layout != ConvTensorLayout::NCHW) + { + throw std::runtime_error("wrong! layout"); + } + + const auto tmp = f_make_for_device_nchw(); + + device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw( + tmp[I0], + tmp[I1], + tmp[I2], + tmp[I3], + tmp[I4], + tmp[I5], + tmp[I6], + in, + wei, + out_device, + nrepeat); + } +#endif + +#if USE_CONV_FWD_V4R4R2_XDL_NCHW + if(algo == ConvForwardAlgo::V4R4R2XDLNCHW) + { + if(layout != ConvTensorLayout::NCHW) + { + throw std::runtime_error("wrong! layout"); + } + + const auto tmp = f_make_for_device_nchw(); + + device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw( + tmp[I0], + tmp[I1], + tmp[I2], + tmp[I3], + tmp[I4], + tmp[I5], + tmp[I6], + in, + wei, + out_device, + nrepeat); + } +#endif + +#if USE_CONV_FWD_V4R4R4_XDL_NHWC + if(algo == ConvForwardAlgo::V4R4R4XDLNHWC) + { + if(layout != ConvTensorLayout::NHWC) + { + throw std::runtime_error("wrong! 
layout"); + } + + const auto tmp = f_make_for_device_nhwc(); + + device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk( + tmp[I0], + tmp[I1], + tmp[I2], + tmp[I3], + tmp[I4], + tmp[I5], + tmp[I6], + in, + wei, + out_device, + nrepeat); + } +#endif + + if(do_verification) + { + host_direct_convolution(in, + wei, + out_host, + make_tuple(conv_stride_h, conv_stride_w), + make_tuple(conv_dilation_h, conv_dilation_w), + make_tuple(in_left_pad_h, in_left_pad_w), + make_tuple(in_right_pad_h, in_right_pad_w), + layout); + + check_error(out_host, out_device); + +#if 0 + if(do_log) + { + LogRangeAsType(std::cout << "in : ", in.mData, ",") << std::endl; + LogRangeAsType(std::cout << "wei: ", wei.mData, ",") << std::endl; + LogRangeAsType(std::cout << "out_host : ", out_host.mData, ",") << std::endl; + LogRangeAsType(std::cout << "out_device: ", out_device.mData, ",") << std::endl; + } +#endif + } +} diff --git a/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp b/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp new file mode 100644 index 0000000000..49e0223b33 --- /dev/null +++ b/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,341 @@ +#include +#include "device.hpp" +#include "host_tensor.hpp" +#include "transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp" +#include "driver_dynamic_gemm_xdlops_v2r3.hpp" + +template +void device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk( + const InLengths& in_n_hi_wi_c_lengths, + const WeiLengths& wei_k_y_x_c_lengths, + const OutLengths& out_n_ho_wo_k_lengths, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + Tensor& in_n_hi_wi_c, + const Tensor& wei_k_y_x_c, + const Tensor& out_n_ho_wo_k, + ck::index_t nrepeat) +{ + using namespace ck; + + std::cout << __func__ << std::endl; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + constexpr auto I5 = Number<5>{}; + constexpr auto I6 = Number<6>{}; + constexpr auto I7 = Number<7>{}; + constexpr auto I8 = Number<8>{}; + + DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace()); + DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace()); + DeviceMem out_n_ho_wo_k_device_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace()); + + in_n_hi_wi_c_device_buf.ToDevice(in_n_hi_wi_c.mData.data()); + wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data()); + out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data()); + + const auto in_n_hi_wi_c_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths); + const auto wei_k_y_x_c_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths); + const auto out_n_ho_wo_k_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths); + +#if 1 + // [M, N, K0, K1] = [128, 128, 4, 4] for fp32 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 128; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 32; + constexpr index_t GemmNPerWave = 32; + constexpr index_t GemmK1 = 4; + + constexpr 
index_t MRepeat = 2; + constexpr index_t NRepeat = 2; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 4>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmM = 2; + constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 4; + constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4; + + constexpr index_t GemmCThreadTransferDstScalarPerVector = 4; +#elif 1 + // [M, N, K0, K1] = [128, 128, 4, 8] for fp16 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 128; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 32; + constexpr index_t GemmNPerWave = 32; + constexpr index_t GemmK1 = 8; + + constexpr index_t MRepeat = 2; + constexpr index_t NRepeat = 2; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 8>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmM = 2; + constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 8>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8; + constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8; + + constexpr index_t GemmCThreadTransferDstScalarPerVector = 4; +#elif 1 + // [M, N, K0, K1] = [256, 128, 4, 8] for fp16 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 256; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 32; + constexpr index_t GemmNPerWave = 32; + constexpr index_t GemmK1 = 8; + + constexpr index_t MRepeat = 4; + constexpr index_t NRepeat = 2; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 8>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmM = 4; + constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 8>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8; + constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8; + + constexpr index_t GemmCThreadTransferDstScalarPerVector = 4; +#elif 1 + // [M, N, K0, K1] = [128, 256, 4, 8] for fp16 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 128; + constexpr index_t GemmNPerBlock = 256; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 32; + constexpr index_t GemmNPerWave = 32; + constexpr index_t GemmK1 = 8; + + constexpr index_t MRepeat = 2; + constexpr index_t NRepeat = 4; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 8>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t 
GemmABlockTransferSrcScalarPerVector_GemmM = 2; + constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 4, 8>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8; + constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8; + + constexpr index_t GemmCThreadTransferDstScalarPerVector = 4; +#elif 0 + // [M, N, K0, K1] = [256, 128, 4, 4] + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 256; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 32; + constexpr index_t GemmNPerWave = 32; + constexpr index_t GemmK1 = 4; + + constexpr index_t MRepeat = 4; + constexpr index_t NRepeat = 2; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 4>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmM = 4; + constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 4; + constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4; + + constexpr index_t GemmCThreadTransferDstScalarPerVector = 4; +#endif + + const auto descs = + transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk(wei_k_y_x_c_desc, + out_n_ho_wo_k_desc, + in_n_hi_wi_c_desc, + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads, + I0, + I0, + Number{}); + + const auto wei_gemmk0_gemmm_gemmk1_grid_desc = descs[I0]; + const auto out_gemmk0_gemmn_gemmk1_grid_desc = descs[I1]; + const auto in_gemmm_gemmn_grid_desc = descs[I2]; + + // HACK: hacks that control index calculation when iterating over A, B, C matrix + constexpr auto wei_gemmk0_gemmm_gemmk1_grid_iterator_hacks = + make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, // 0+: gemmk0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1+: gemmm + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 2+: gemmk1 + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, // 0-: Gemmk0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1-: Gemmm + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 2-: Gemmk1 + + constexpr auto out_gemmk0_gemmn_gemmk1_grid_iterator_hacks = make_tuple( + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, // 0+: gemmk0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{}, // 1+: gemmn + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 2+: gemmk1 + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, // 0-: gemmk0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{}, // 1-: gemmn + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 2-: gemmk1 + + constexpr auto in_m0_m1_m2_n_grid_iterator_hacks = make_tuple( + make_tuple( + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: MRepeat + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, // 1+: NRepeat + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 2+: MWaves + 
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, // 3+: NWaves + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 4+: M0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 5+: M1 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 6+: M2 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}), // 7+: N1 + make_tuple( + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: MRepeat + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, // 1-: NRepeat + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 2-: MWaves + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, // 3-: NWaves + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 4-: M0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 5-: M1 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 6-: M2 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{})); // 7-: N1 + + constexpr auto wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks = + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}; + + constexpr auto out_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks = + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0>{}; + + for(index_t i = 0; i < 5; ++i) + { + float ave_time = driver_dynamic_gemm_xdlops_v2r3< + BlockSize, + TInWei, + TAcc, + TOut, + InMemoryDataOperationEnum_t::Set, + decltype(wei_gemmk0_gemmm_gemmk1_grid_desc), + decltype(out_gemmk0_gemmn_gemmk1_grid_desc), + decltype(in_gemmm_gemmn_grid_desc), + GemmMPerBlock, + GemmNPerBlock, + GemmKPerBlock, + GemmMPerWave, + GemmNPerWave, + GemmK1, + MRepeat, + NRepeat, + GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1, + GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1, + Sequence<2, 0, 1>, + Sequence<0, 2, 1>, + 1, + GemmABlockTransferSrcScalarPerVector_GemmM, + GemmABlockTransferDstScalarPerVector_GemmK1, + false, // don't move back src coordinate after threadwise copy + GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1, + GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1, + Sequence<1, 0, 2>, + Sequence<1, 0, 2>, + 2, + GemmBBlockTransferSrcScalarPerVector_GemmK1, + GemmBBlockTransferDstScalarPerVector_GemmK1, + false, // don't move back src coordinate after threadwise copy + Sequence<1, 3, 7, 0, 2, 4, 5, 6>, + 6, + GemmCThreadTransferDstScalarPerVector, + decltype(wei_gemmk0_gemmm_gemmk1_grid_iterator_hacks), + decltype(out_gemmk0_gemmn_gemmk1_grid_iterator_hacks), + decltype(in_m0_m1_m2_n_grid_iterator_hacks), + decltype(wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks), + decltype(out_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks), + false // CAccessOrderMRepeatNRepeat + >(static_cast(wei_k_y_x_c_device_buf.GetDeviceBuffer()), + static_cast(out_n_ho_wo_k_device_buf.GetDeviceBuffer()), + static_cast(in_n_hi_wi_c_device_buf.GetDeviceBuffer()), + wei_gemmk0_gemmm_gemmk1_grid_desc, + out_gemmk0_gemmn_gemmk1_grid_desc, + in_gemmm_gemmn_grid_desc, + wei_gemmk0_gemmm_gemmk1_grid_iterator_hacks, + out_gemmk0_gemmn_gemmk1_grid_iterator_hacks, + in_m0_m1_m2_n_grid_iterator_hacks, + wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks, + out_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks, + nrepeat); + + { + const auto N = out_n_ho_wo_k_lengths[I0]; + const auto K = out_n_ho_wo_k_lengths[I3]; + const auto C = 
wei_k_y_x_c_lengths[I3]; + + const auto Hi = in_n_hi_wi_c_lengths[I1]; + const auto Wi = in_n_hi_wi_c_lengths[I2]; + + const auto Ho = out_n_ho_wo_k_lengths[I1]; + const auto Wo = out_n_ho_wo_k_lengths[I2]; + + const auto Y = wei_k_y_x_c_lengths[I1]; + const auto X = wei_k_y_x_c_lengths[I2]; + + float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / + (std::size_t(1000) * 1000 * 1000) / ave_time; + + std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" + << std::endl; + } + } + + // copy result back to host + in_n_hi_wi_c_device_buf.FromDevice(in_n_hi_wi_c.mData.data()); +} diff --git a/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp b/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp new file mode 100644 index 0000000000..ce4dd155f6 --- /dev/null +++ b/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,317 @@ +#include +#include "device.hpp" +#include "host_tensor.hpp" +#include "transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp" +#include "driver_dynamic_gemm_xdlops_v2r3.hpp" + +template +void device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk( + const InLengths& in_n_hi_wi_c_lengths, + const WeiLengths& wei_k_y_x_c_lengths, + const OutLengths& out_n_ho_wo_k_lengths, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + Tensor& in_n_hi_wi_c, + const Tensor& wei_k_y_x_c, + const Tensor& out_n_ho_wo_k, + ck::index_t nrepeat) +{ + using namespace ck; + + std::cout << __func__ << std::endl; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + constexpr auto I5 = Number<5>{}; + constexpr auto I6 = Number<6>{}; + constexpr auto I7 = Number<7>{}; + constexpr auto I8 = Number<8>{}; + + DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace()); + DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace()); + DeviceMem out_n_ho_wo_k_device_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace()); + + in_n_hi_wi_c_device_buf.ToDevice(in_n_hi_wi_c.mData.data()); + wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data()); + out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data()); + + const auto in_n_hi_wi_c_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths); + const auto wei_k_y_x_c_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths); + const auto out_n_ho_wo_k_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths); + +#if 0 + // [M, N, K0, K1] = [256, 128, 4, 4] for fp32 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 256; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 32; + constexpr index_t GemmNPerWave = 32; + constexpr index_t GemmK1 = 4; + + constexpr index_t MRepeat = 4; + constexpr index_t NRepeat = 2; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 4>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4; + 
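+    // Descriptive note on this tuning block (verifiable from the values above; the
+    // identifiers are the ones already declared in this configuration): each
+    // block-wise copy is expected to tile the block with its thread cluster, i.e.
+    // the per-thread slice lengths times the cluster lengths give the block tile
+    // and the cluster accounts for every thread in the block. For A here:
+    //   Sequence<1, 4, 4> (slice) * Sequence<4, 64, 1> (cluster) = [4, 256, 4]
+    //                               = [GemmKPerBlock, GemmMPerBlock, GemmK1]
+    //   4 * 64 * 1 = 256 = BlockSize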
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 2; + constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4; + + constexpr index_t GemmCThreadTransferDstScalarPerVector = 1; +#elif 0 + // [M, N, K0, K1] = [128, 128, 4, 4] for fp32 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 128; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 32; + constexpr index_t GemmNPerWave = 32; + constexpr index_t GemmK1 = 4; + + constexpr index_t MRepeat = 2; + constexpr index_t NRepeat = 2; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 4>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4; + constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 2; + constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4; + + constexpr index_t GemmCThreadTransferDstScalarPerVector = 1; +#elif 1 + // [M, N, K0, K1] = [256, 128, 4, 8] for fp16 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 256; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 32; + constexpr index_t GemmNPerWave = 32; + constexpr index_t GemmK1 = 8; + + constexpr index_t MRepeat = 4; + constexpr index_t NRepeat = 2; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 8>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8; + constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 8>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 2; + constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8; + + constexpr index_t GemmCThreadTransferDstScalarPerVector = 1; +#elif 1 + // [M, N, K0, K1] = [128, 256, 4, 8] for fp16 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 128; + constexpr index_t GemmNPerBlock = 256; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 32; + constexpr index_t GemmNPerWave = 32; + constexpr index_t GemmK1 = 8; + + constexpr index_t MRepeat = 2; + constexpr index_t NRepeat = 4; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 8>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8; + constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 4, 8>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t 
GemmBBlockTransferSrcScalarPerVector_GemmN = 4; + constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8; + + constexpr index_t GemmCThreadTransferDstScalarPerVector = 1; +#endif + + const auto descs = + transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk(out_n_ho_wo_k_desc, + wei_k_y_x_c_desc, + in_n_hi_wi_c_desc, + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads, + I0, + I0, + Number{}); + + const auto out_gemmk0_gemmm_gemmk1_grid_desc = descs[I0]; + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = descs[I1]; + const auto in_gemmm_gemmn_grid_desc = descs[I2]; + + // HACK: hacks that control index calculation when iterating over A, B, C matrix + constexpr auto out_gemmk0_gemmm_gemmk1_grid_iterator_hacks = make_tuple( + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, // 0+: gemmk0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{}, // 1+: gemmm + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 2+: gemmk1 + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, // 0-: gemmk0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{}, // 1-: gemmm + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 2-: gemmk1 + + constexpr auto wei_gemmk0_gemmn_gemmk1_grid_iterator_hacks = + make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, // 0+: gemmk0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1+: gemmn + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 2+: gemmk1 + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, // 0-: Gemmk0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1-: Gemmn + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 2-: Gemmk1 + + constexpr auto in_m0_m1_m2_n_grid_iterator_hacks = make_tuple( + make_tuple( + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, // 0+: MRepeat + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1+: NRepeat + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, // 2+: MWaves + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3+: NWaves + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, // 4+: M0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, // 5+: M1 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, // 6+: M2 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 7+: N1 + make_tuple( + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, // 0-: MRepeat + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1-: NRepeat + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, // 2-: MWaves + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3-: NWaves + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, // 4-: M0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, // 5-: M1 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, // 6-: M2 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N1 + + constexpr auto out_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks = + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0>{}; + + constexpr auto wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks = + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}; + + for(index_t i = 0; i < 5; 
++i) + { + float ave_time = driver_dynamic_gemm_xdlops_v2r3< + BlockSize, + TInWei, + TAcc, + TOut, + InMemoryDataOperationEnum_t::Set, + decltype(out_gemmk0_gemmm_gemmk1_grid_desc), + decltype(wei_gemmk0_gemmn_gemmk1_grid_desc), + decltype(in_gemmm_gemmn_grid_desc), + GemmMPerBlock, + GemmNPerBlock, + GemmKPerBlock, + GemmMPerWave, + GemmNPerWave, + GemmK1, + MRepeat, + NRepeat, + GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1, + GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1, + Sequence<1, 0, 2>, + Sequence<1, 0, 2>, + 2, + GemmABlockTransferSrcScalarPerVector_GemmK1, + GemmABlockTransferDstScalarPerVector_GemmK1, + false, // don't move back src coordinate after threadwise copy + GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1, + GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1, + Sequence<2, 0, 1>, + Sequence<0, 2, 1>, + 1, + GemmBBlockTransferSrcScalarPerVector_GemmN, + GemmBBlockTransferDstScalarPerVector_GemmK1, + false, // don't move back src coordinate after threadwise copy +#if 0 + Sequence<0, 2, 4, 5, 6, 1, 3, 7>, +#else + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, +#endif + 7, + GemmCThreadTransferDstScalarPerVector, + decltype(out_gemmk0_gemmm_gemmk1_grid_iterator_hacks), + decltype(wei_gemmk0_gemmn_gemmk1_grid_iterator_hacks), + decltype(in_m0_m1_m2_n_grid_iterator_hacks), + decltype(out_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks), + decltype(wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks), + true // CAccessOrderMRepeatNRepeat + >(static_cast(out_n_ho_wo_k_device_buf.GetDeviceBuffer()), + static_cast(wei_k_y_x_c_device_buf.GetDeviceBuffer()), + static_cast(in_n_hi_wi_c_device_buf.GetDeviceBuffer()), + out_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + in_gemmm_gemmn_grid_desc, + out_gemmk0_gemmm_gemmk1_grid_iterator_hacks, + wei_gemmk0_gemmn_gemmk1_grid_iterator_hacks, + in_m0_m1_m2_n_grid_iterator_hacks, + out_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks, + wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks, + nrepeat); + + { + const auto N = out_n_ho_wo_k_lengths[I0]; + const auto K = out_n_ho_wo_k_lengths[I3]; + const auto C = wei_k_y_x_c_lengths[I3]; + + const auto Hi = in_n_hi_wi_c_lengths[I1]; + const auto Wi = in_n_hi_wi_c_lengths[I2]; + + const auto Ho = out_n_ho_wo_k_lengths[I1]; + const auto Wo = out_n_ho_wo_k_lengths[I2]; + + const auto Y = wei_k_y_x_c_lengths[I1]; + const auto X = wei_k_y_x_c_lengths[I2]; + + float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / + (std::size_t(1000) * 1000 * 1000) / ave_time; + + std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" + << std::endl; + } + } + + // copy result back to host + in_n_hi_wi_c_device_buf.FromDevice(in_n_hi_wi_c.mData.data()); +} diff --git a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp new file mode 100644 index 0000000000..24ba775309 --- /dev/null +++ b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp @@ -0,0 +1,210 @@ +#include +#include "device.hpp" +#include "host_tensor.hpp" +#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp" +#include "driver_dynamic_gemm_dlops_v1r2.hpp" + +template +void device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw( + const InLengths& in_n_c_hi_wi_lengths, + 
const WeiLengths& wei_k_c_y_x_lengths, + const OutLengths& out_n_k_ho_wo_lengths, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const Tensor& in_n_c_hi_wi, + const Tensor& wei_k_c_y_x, + Tensor& out_n_k_ho_wo, + ck::index_t nrepeat) +{ + using namespace ck; + + std::cout << __func__ << std::endl; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + constexpr auto I5 = Number<5>{}; + constexpr auto I6 = Number<6>{}; + constexpr auto I7 = Number<7>{}; + constexpr auto I8 = Number<8>{}; + + DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace()); + DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace()); + DeviceMem out_n_k_ho_wo_device_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace()); + + in_n_c_hi_wi_device_buf.ToDevice(in_n_c_hi_wi.mData.data()); + wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data()); + out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data()); + + const auto in_n_c_hi_wi_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths); + const auto wei_k_c_y_x_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths); + const auto out_n_k_ho_wo_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths); + +#if 1 + // cdata = 64, BlockSize = 256, 128x128x8 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlockM1 = 128; + constexpr index_t GemmNPerBlockN1 = 128; + constexpr index_t GemmKPerBlock = 8; + + constexpr index_t GemmM1PerThreadM111 = 4; + constexpr index_t GemmN1PerThreadN111 = 4; + constexpr index_t GemmKPerThread = 1; + + constexpr index_t GemmM11N11ThreadClusterM1100 = 8; + constexpr index_t GemmM11N11ThreadClusterN1100 = 8; + constexpr index_t GemmM11N11ThreadClusterM1101 = 2; + constexpr index_t GemmM11N11ThreadClusterN1101 = 2; + + using GemmABlockTransferThreadSliceLengths_K_M0_M1 = Sequence<4, 1, 1>; + using GemmABlockTransferThreadClusterLengths_K_M0_M1 = Sequence<2, 1, 128>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_K = 4; + constexpr index_t GemmABlockTransferDstScalarPerVector_M1 = 1; + + using GemmBBlockTransferThreadSliceLengths_K_N0_N1 = Sequence<4, 1, 1>; + using GemmBBlockTransferThreadClusterLengths_K_N0_N1 = Sequence<2, 1, 128>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_N1 = 1; + constexpr index_t GemmBBlockTransferDstScalarPerVector_N1 = 1; + + constexpr index_t GemmCThreadTransferDstScalarPerVector_N11 = 1; +#endif + + const auto descs = + transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc, + in_n_c_hi_wi_desc, + out_n_k_ho_wo_desc, + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads); + + // HACK: hacks that control index calculation when iterating over A, B, C matrix + constexpr auto wei_gemmk_gemmm0_gemmn1_grid_iterator_hacks = + make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{})); + + constexpr auto in_gemmk_gemmn0_gemmn1_grid_iterator_hacks = + make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, + Sequence<0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{})); + + constexpr auto out_gemmm0_gemmm10_gemmm11_gemmn0_gemmn10_gemmn11_grid_iterator_hacks = + make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{})); + + constexpr auto wei_gemmk_gemmm0_gemmm1_grid_move_slice_window_iterator_hacks = + Sequence<0, 0, 0, 0, 0>{}; + + constexpr auto in_gemmk_gemmn0_gemmn1_grid_move_slice_window_iterator_hacks = + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>{}; + + const auto wei_gemmk_gemmm_grid_desc = descs[I0]; + const auto in_gemmk_gemmn_grid_desc = descs[I1]; + const auto out_gemmm_gemmn_grid_desc = descs[I2]; + + for(index_t i = 0; i < 5; ++i) + { + float ave_time = driver_dynamic_gemm_dlops_v1r2< + BlockSize, + TInWei, + TAcc, + TOut, + InMemoryDataOperationEnum_t::Set, + decltype(wei_gemmk_gemmm_grid_desc), + decltype(in_gemmk_gemmn_grid_desc), + decltype(out_gemmm_gemmn_grid_desc), + GemmMPerBlockM1, + GemmNPerBlockN1, + GemmKPerBlock, + GemmM1PerThreadM111, + GemmN1PerThreadN111, + GemmKPerThread, + GemmM11N11ThreadClusterM1100, + GemmM11N11ThreadClusterN1100, + GemmM11N11ThreadClusterM1101, + GemmM11N11ThreadClusterN1101, + GemmABlockTransferThreadSliceLengths_K_M0_M1, + GemmABlockTransferThreadClusterLengths_K_M0_M1, + Sequence<2, 1, 0>, // ABlockTransferThreadClusterArrangeOrder + Sequence<2, 1, 0>, // ABlockTransferSrcAccessOrder + 0, // ABlockTransferSrcVectorDim + GemmABlockTransferSrcScalarPerVector_K, + GemmABlockTransferDstScalarPerVector_M1, + false, // don't move back src coordinate after threadwise copy + GemmBBlockTransferThreadSliceLengths_K_N0_N1, + GemmBBlockTransferThreadClusterLengths_K_N0_N1, + Sequence<0, 1, 2>, // BBlockTransferThreadClusterArrangeOrder + Sequence<0, 1, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + GemmBBlockTransferSrcScalarPerVector_N1, + GemmBBlockTransferDstScalarPerVector_N1, + false, // don't move back src coordinate after threadwise copy + Sequence<3, 4, 5, 0, 1, 2>, // CThreadTransferSrcDstAccessOrder + 5, // CThreadTransferSrcDstVectorDim + GemmCThreadTransferDstScalarPerVector_N11, + decltype(wei_gemmk_gemmm0_gemmn1_grid_iterator_hacks), + decltype(in_gemmk_gemmn0_gemmn1_grid_iterator_hacks), + decltype(out_gemmm0_gemmm10_gemmm11_gemmn0_gemmn10_gemmn11_grid_iterator_hacks), + decltype(wei_gemmk_gemmm0_gemmm1_grid_move_slice_window_iterator_hacks), + decltype(in_gemmk_gemmn0_gemmn1_grid_move_slice_window_iterator_hacks)>( + static_cast(wei_k_c_y_x_device_buf.GetDeviceBuffer()), + static_cast(in_n_c_hi_wi_device_buf.GetDeviceBuffer()), + static_cast(out_n_k_ho_wo_device_buf.GetDeviceBuffer()), + wei_gemmk_gemmm_grid_desc, + in_gemmk_gemmn_grid_desc, + out_gemmm_gemmn_grid_desc, + wei_gemmk_gemmm0_gemmn1_grid_iterator_hacks, + in_gemmk_gemmn0_gemmn1_grid_iterator_hacks, + out_gemmm0_gemmm10_gemmm11_gemmn0_gemmn10_gemmn11_grid_iterator_hacks, + wei_gemmk_gemmm0_gemmm1_grid_move_slice_window_iterator_hacks, + in_gemmk_gemmn0_gemmn1_grid_move_slice_window_iterator_hacks, + nrepeat); + + float perf = (float)calculate_convolution_flops( + 
in_n_c_hi_wi_desc, wei_k_c_y_x_desc, out_n_k_ho_wo_desc) / + (std::size_t(1000) * 1000 * 1000) / ave_time; + + std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl; + } + + // copy result back to host + out_n_k_ho_wo_device_buf.FromDevice(out_n_k_ho_wo.mData.data()); +} diff --git a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp new file mode 100644 index 0000000000..b6b1cc8969 --- /dev/null +++ b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp @@ -0,0 +1,283 @@ +#include +#include "device.hpp" +#include "host_tensor.hpp" +#include "driver_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp" + +template +void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw( + const InLengths& in_n_c_hi_wi_lengths, + const WeiLengths& wei_k_c_y_x_lengths, + const OutLengths& out_n_k_ho_wo_lengths, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const Tensor& in_n_c_hi_wi, + const Tensor& wei_k_c_y_x, + Tensor& out_n_k_ho_wo, + ck::index_t nrepeat) +{ + using namespace ck; + + std::cout << __func__ << std::endl; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + constexpr auto I5 = Number<5>{}; + constexpr auto I6 = Number<6>{}; + constexpr auto I7 = Number<7>{}; + constexpr auto I8 = Number<8>{}; + + DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace()); + DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace()); + DeviceMem out_n_k_ho_wo_device_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace()); + + in_n_c_hi_wi_device_buf.ToDevice(in_n_c_hi_wi.mData.data()); + wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data()); + out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data()); + + const auto in_n_c_hi_wi_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths); + const auto wei_k_c_y_x_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths); + const auto out_n_k_ho_wo_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths); + +#if 0 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 128; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 64; + constexpr index_t GemmNPerWave = 64; + constexpr index_t GemmKPack = 8; + + constexpr index_t MRepeat = 1; + constexpr index_t NRepeat = 1; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 8>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 8; + constexpr index_t GemmABlockTransferDstScalarPerVector_KPack = 8; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 4, 4>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 32, 2>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 4; + constexpr index_t GemmBBlockTransferDstScalarPerVector_KPack = 4; + + constexpr index_t 
GemmCThreadTransferDstScalarPerVector_GemmN1 = 1; +#elif 0 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 256; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 64; + constexpr index_t GemmNPerWave = 64; + constexpr index_t GemmKPack = 8; + + constexpr index_t MRepeat = 2; + constexpr index_t NRepeat = 1; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 8>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 8; + constexpr index_t GemmABlockTransferDstScalarPerVector_KPack = 8; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 4, 4>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 32, 2>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 4; + constexpr index_t GemmBBlockTransferDstScalarPerVector_KPack = 4; + + constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1; +#elif 0 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 256; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 64; + constexpr index_t GemmNPerWave = 64; + constexpr index_t GemmKPack = 8; + + constexpr index_t MRepeat = 2; + constexpr index_t NRepeat = 1; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 8>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 8; + constexpr index_t GemmABlockTransferDstScalarPerVector_KPack = 8; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 4, 4>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 32, 2>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1; + constexpr index_t GemmBBlockTransferDstScalarPerVector_KPack = 4; + + constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1; +#elif 1 + // [M, N, K0, K1] = [256, 128, 4, 4] + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 256; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 64; + constexpr index_t GemmNPerWave = 64; + constexpr index_t GemmKPack = 4; + + constexpr index_t MRepeat = 2; + constexpr index_t NRepeat = 1; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 4>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 4; + constexpr index_t GemmABlockTransferDstScalarPerVector_KPack = 4; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1; + constexpr index_t GemmBBlockTransferDstScalarPerVector_KPack = 4; + + constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1; +#elif 1 + // [M, N, K0, K1] = [128, 128, 4, 4] + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 128; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 64; + constexpr index_t GemmNPerWave = 64; + constexpr 
index_t GemmKPack = 4; + + constexpr index_t MRepeat = 1; + constexpr index_t NRepeat = 1; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 4>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 4; + constexpr index_t GemmABlockTransferDstScalarPerVector_KPack = 4; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1; + constexpr index_t GemmBBlockTransferDstScalarPerVector_KPack = 4; + + constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1; +#endif + + const auto descs = +#if 1 + transform_forward_convolution_into_gemm_v4r4_xdlops_nchw_kcyx_nkhw_pad +#else + transform_forward_convolution_into_gemm_v4r4_xdlops_nchw_kcyx_nkhw_1x1 +#endif + ( + wei_k_c_y_x_desc, + in_n_c_hi_wi_desc, + out_n_k_ho_wo_desc, + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads); + + for(index_t i = 0; i < 5; ++i) + { +#if 0 + float ave_time = launch_kernel_dynamic_gemm_xdlops_v1 +#else + float ave_time = launch_kernel_dynamic_gemm_xdlops_v2 +#endif + , + Sequence<1, 0, 2>, + 2, + GemmABlockTransferSrcScalarPerVector_GemmK, + GemmABlockTransferDstScalarPerVector_KPack, + false, // don't move back src coordinate after threadwise copy + GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1, + GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1, + Sequence<0, 2, 1>, + Sequence<1, 0, 2>, + 1, + GemmBBlockTransferSrcScalarPerVector_GemmN, + GemmBBlockTransferDstScalarPerVector_KPack, + false, // don't move back src coordinate after threadwise copy, which will be fused + // with MoveSrcSliceWindow() to save addr computation + Sequence<2, 3, 0, 1>, + 3, + GemmCThreadTransferDstScalarPerVector_GemmN1, + decltype(descs[I4]), + decltype(descs[I5]), + decltype(descs[I6]), + decltype(descs[I7]), + decltype(descs[I8])>(static_cast(wei_k_c_y_x_device_buf.GetDeviceBuffer()), + static_cast(in_n_c_hi_wi_device_buf.GetDeviceBuffer()), + static_cast(out_n_k_ho_wo_device_buf.GetDeviceBuffer()), + descs[I0], + descs[I1], + descs[I2], + descs[I3], + descs[I4], + descs[I5], + descs[I6], + descs[I7], + descs[I8], + nrepeat); + + float perf = (float)calculate_convolution_flops( + in_n_c_hi_wi_desc, wei_k_c_y_x_desc, out_n_k_ho_wo_desc) / + (std::size_t(1000) * 1000 * 1000) / ave_time; + + std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl; + } + + // copy result back to host + out_n_k_ho_wo_device_buf.FromDevice(out_n_k_ho_wo.mData.data()); +} diff --git a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp new file mode 100644 index 0000000000..cdd1084c0d --- /dev/null +++ b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,284 @@ +#include +#include "device.hpp" +#include "host_tensor.hpp" +#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp" +#include "driver_dynamic_gemm_dlops_v1r3.hpp" + +template +void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk( + const InLengths& in_n_hi_wi_c_lengths, + const WeiLengths& wei_k_y_x_c_lengths, + const 
OutLengths& out_n_ho_wo_k_lengths, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const Tensor& in_n_hi_wi_c, + const Tensor& wei_k_y_x_c, + Tensor& out_n_ho_wo_k, + ck::index_t nrepeat) +{ + using namespace ck; + + std::cout << __func__ << std::endl; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + constexpr auto I5 = Number<5>{}; + constexpr auto I6 = Number<6>{}; + constexpr auto I7 = Number<7>{}; + constexpr auto I8 = Number<8>{}; + + DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace()); + DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace()); + DeviceMem out_n_ho_wo_k_device_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace()); + + in_n_hi_wi_c_device_buf.ToDevice(in_n_hi_wi_c.mData.data()); + wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data()); + out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data()); + + const auto in_n_hi_wi_c_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths); + const auto wei_k_y_x_c_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths); + const auto out_n_ho_wo_k_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths); + +#if 1 + // [M, N, K0, K1] = [128, 128, 8, 1] for fp32 + // cdata = 64, BlockSize = 256 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlockM1 = 128; + constexpr index_t GemmNPerBlockN1 = 128; + constexpr index_t GemmKPerBlock = 8; + constexpr index_t GemmK1 = 1; + + constexpr index_t GemmM1PerThreadM111 = 4; + constexpr index_t GemmN1PerThreadN111 = 4; + constexpr index_t GemmKPerThread = 1; + + using GemmM11N11ThreadClusterM110Xs = Sequence<8, 2>; + using GemmM11N11ThreadClusterN110Xs = Sequence<8, 2>; + + using GemmABlockTransferThreadSliceLengths_K0_M0_M1_K1 = Sequence<4, 1, 1, 1>; + using GemmABlockTransferThreadClusterLengths_K0_M0_M1_K1 = Sequence<2, 1, 128, 1>; + + using GemmABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1 = Sequence<4, 1, 1, 1>; + using GemmABlockTransferDstVectorTensorLengths_K0_M0_M1_K1 = Sequence<1, 1, 1, 1>; + + using GemmBBlockTransferThreadSliceLengths_K0_N0_N1_K1 = Sequence<4, 1, 1, 1>; + using GemmBBlockTransferThreadClusterLengths_K0_N0_N1_K1 = Sequence<2, 1, 128, 1>; + + using GemmBBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1 = Sequence<4, 1, 1, 1>; + using GemmBBlockTransferDstVectorTensorLengths_K0_N0_N1_K1 = Sequence<1, 1, 1, 1>; + + constexpr index_t GemmCThreadTransferDstScalarPerVector_N11 = 4; +#elif 1 + // [M, N, K0, K1] = [128, 128, 8, 2] for fp16 + // cdata = 64, BlockSize = 256 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlockM1 = 128; + constexpr index_t GemmNPerBlockN1 = 128; + constexpr index_t GemmKPerBlock = 8; + constexpr index_t GemmK1 = 2; + + constexpr index_t GemmM1PerThreadM111 = 4; + constexpr index_t GemmN1PerThreadN111 = 4; + constexpr index_t GemmKPerThread = 1; + + using GemmM11N11ThreadClusterM110Xs = Sequence<8, 2>; + using GemmM11N11ThreadClusterN110Xs = Sequence<8, 2>; + + using GemmABlockTransferThreadSliceLengths_K0_M0_M1_K1 = Sequence<4, 1, 1, 2>; + using GemmABlockTransferThreadClusterLengths_K0_M0_M1_K1 = Sequence<2, 1, 128, 1>; + + using GemmABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1 = Sequence<4, 1, 1, 2>; + using 
GemmABlockTransferDstVectorTensorLengths_K0_M0_M1_K1 = Sequence<1, 1, 1, 2>; + + using GemmBBlockTransferThreadSliceLengths_K0_N0_N1_K1 = Sequence<4, 1, 1, 2>; + using GemmBBlockTransferThreadClusterLengths_K0_N0_N1_K1 = Sequence<2, 1, 128, 1>; + + using GemmBBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1 = Sequence<4, 1, 1, 2>; + using GemmBBlockTransferDstVectorTensorLengths_K0_N0_N1_K1 = Sequence<1, 1, 1, 2>; + + constexpr index_t GemmCThreadTransferDstScalarPerVector_N11 = 4; +#elif 1 + // [M, N, K0, K1] = [128, 128, 8, 4] for i8 + // cdata = 64, BlockSize = 256 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlockM1 = 128; + constexpr index_t GemmNPerBlockN1 = 128; + constexpr index_t GemmKPerBlock = 8; + constexpr index_t GemmK1 = 4; + + constexpr index_t GemmM1PerThreadM111 = 4; + constexpr index_t GemmN1PerThreadN111 = 4; + constexpr index_t GemmKPerThread = 1; + + using GemmM11N11ThreadClusterM110Xs = Sequence<8, 2>; + using GemmM11N11ThreadClusterN110Xs = Sequence<8, 2>; + + using GemmABlockTransferThreadSliceLengths_K0_M0_M1_K1 = Sequence<4, 1, 1, 4>; + using GemmABlockTransferThreadClusterLengths_K0_M0_M1_K1 = Sequence<2, 1, 128, 1>; + + using GemmABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1 = Sequence<4, 1, 1, 4>; + using GemmABlockTransferDstVectorTensorLengths_K0_M0_M1_K1 = Sequence<1, 1, 1, 4>; + + using GemmBBlockTransferThreadSliceLengths_K0_N0_N1_K1 = Sequence<4, 1, 1, 4>; + using GemmBBlockTransferThreadClusterLengths_K0_N0_N1_K1 = Sequence<2, 1, 128, 1>; + + using GemmBBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1 = Sequence<4, 1, 1, 4>; + using GemmBBlockTransferDstVectorTensorLengths_K0_N0_N1_K1 = Sequence<1, 1, 1, 4>; + + constexpr index_t GemmCThreadTransferDstScalarPerVector_N11 = 4; +#endif + + const auto descs = + transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk_pad(in_n_hi_wi_c_desc, + wei_k_y_x_c_desc, + out_n_ho_wo_k_desc, + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads, + Number{}); + + const auto in_gemmk0_gemmm_gemmk1_grid_desc = descs[I0]; + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = descs[I1]; + const auto out_gemmm_gemmn_grid_desc = descs[I2]; + + // HACK: hacks that control index calculation when iterating over A, B, C matrix + constexpr auto in_gemmk0_gemmm0_gemmm1_gemmk1_grid_iterator_hacks = make_tuple( + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 0+: GemmK0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 1+: GemmM0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 2+: GemmM1 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}), // 3+: GemmK1 + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 0-: GemmK0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 1-: GemmM0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 3-: GemmM1 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{})); // 3-: GemmK1 + + constexpr auto wei_gemmk0_gemmn0_gemmn1_gemmk1_grid_iterator_hacks = + make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GemmK0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0>{}, // 1+: GemmN0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0>{}, // 2+: GemmN1 + Sequence<0, 0, 0, 0, 0, 0, 0, 0>{}), // 3+: GemmK1 + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GemmK0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0>{}, // 1-: GemmN0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0>{}, // 2-: GemmN1 + Sequence<0, 0, 0, 0, 0, 0, 0, 0>{})); // 3-: GemmK1 + + constexpr auto 
out_gemmm0_gemmm10_gemmm11_gemmn0_gemmn10_gemmn11_grid_iterator_hacks = + make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, // 0+: GemmM0 + Sequence<0, 0, 0, 0, 0>{}, // 1+: GemmM10 + Sequence<0, 0, 0, 0, 0>{}, // 2+: GemmM11 + Sequence<0, 0, 0, 0, 0>{}, // 3+: GemmN0 + Sequence<0, 0, 0, 0, 0>{}, // 4+: GemmN10 + Sequence<0, 0, 0, 0, 0>{}), // 5+: GemmN11 + make_tuple(Sequence<0, 0, 0, 0, 0>{}, // 0-: GemmM0 + Sequence<0, 0, 0, 0, 0>{}, // 1-: GemmM10 + Sequence<0, 0, 0, 0, 0>{}, // 2-: GemmM11 + Sequence<0, 0, 0, 0, 0>{}, // 3-: GemmN0 + Sequence<0, 0, 0, 0, 0>{}, // 4-: GemmN10 + Sequence<0, 0, 0, 0, 0>{})); // 5-: GemmN11 + + constexpr auto in_gemmk0_gemmm0_gemmm1_gemmk1_grid_move_slice_window_iterator_hacks = + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0>{}; + + constexpr auto wei_gemmk0_gemmn0_gemmn1_gemmk1_grid_move_slice_window_iterator_hacks = + Sequence<0, 0, 0, 0, 0, 0, 0, 0>{}; + + for(index_t i = 0; i < 5; ++i) + { + float ave_time = driver_dynamic_gemm_dlops_v1r3< + BlockSize, + TInWei, + TAcc, + TOut, + InMemoryDataOperationEnum_t::Set, + decltype(in_gemmk0_gemmm_gemmk1_grid_desc), + decltype(wei_gemmk0_gemmn_gemmk1_grid_desc), + decltype(out_gemmm_gemmn_grid_desc), + GemmMPerBlockM1, + GemmNPerBlockN1, + GemmKPerBlock, + GemmM1PerThreadM111, + GemmN1PerThreadN111, + GemmKPerThread, + GemmM11N11ThreadClusterM110Xs, + GemmM11N11ThreadClusterN110Xs, + GemmABlockTransferThreadSliceLengths_K0_M0_M1_K1, + GemmABlockTransferThreadClusterLengths_K0_M0_M1_K1, + Sequence<1, 2, 0, 3>, // ABlockTransferThreadClusterArrangeOrder + Sequence<1, 2, 0, 3>, // ABlockTransferSrcAccessOrder + GemmABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, + Sequence<1, 2, 0, 3>, // ABlockTransferSrcVectorTensorContiguousDimOrder + GemmABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, + GemmBBlockTransferThreadSliceLengths_K0_N0_N1_K1, + GemmBBlockTransferThreadClusterLengths_K0_N0_N1_K1, + Sequence<1, 2, 0, 3>, // BBlockTransferThreadClusterArrangeOrder + Sequence<1, 2, 0, 3>, // BBlockTransferSrcAccessOrder + GemmBBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, + Sequence<1, 2, 0, 3>, // BBlockTransferSrcVectorTensorContiguousDimOrder + GemmBBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, + Sequence<0, 1, 2, 3, 4, 5>, // CThreadTransferSrcDstAccessOrder + 5, // CThreadTransferSrcDstVectorDim + GemmCThreadTransferDstScalarPerVector_N11, + decltype(in_gemmk0_gemmm0_gemmm1_gemmk1_grid_iterator_hacks), + decltype(wei_gemmk0_gemmn0_gemmn1_gemmk1_grid_iterator_hacks), + decltype(out_gemmm0_gemmm10_gemmm11_gemmn0_gemmn10_gemmn11_grid_iterator_hacks), + decltype(in_gemmk0_gemmm0_gemmm1_gemmk1_grid_move_slice_window_iterator_hacks), + decltype(wei_gemmk0_gemmn0_gemmn1_gemmk1_grid_move_slice_window_iterator_hacks)>( + static_cast(in_n_hi_wi_c_device_buf.GetDeviceBuffer()), + static_cast(wei_k_y_x_c_device_buf.GetDeviceBuffer()), + static_cast(out_n_ho_wo_k_device_buf.GetDeviceBuffer()), + in_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc, + in_gemmk0_gemmm0_gemmm1_gemmk1_grid_iterator_hacks, + wei_gemmk0_gemmn0_gemmn1_gemmk1_grid_iterator_hacks, + out_gemmm0_gemmm10_gemmm11_gemmn0_gemmn10_gemmn11_grid_iterator_hacks, + in_gemmk0_gemmm0_gemmm1_gemmk1_grid_move_slice_window_iterator_hacks, + wei_gemmk0_gemmn0_gemmn1_gemmk1_grid_move_slice_window_iterator_hacks, + nrepeat); + + { + const auto N = out_n_ho_wo_k_lengths[I0]; + const auto K = out_n_ho_wo_k_lengths[I3]; + const auto C = wei_k_y_x_c_lengths[I3]; + + const auto Hi = in_n_hi_wi_c_lengths[I1]; + 
const auto Wi = in_n_hi_wi_c_lengths[I2]; + + const auto Ho = out_n_ho_wo_k_lengths[I1]; + const auto Wo = out_n_ho_wo_k_lengths[I2]; + + const auto Y = wei_k_y_x_c_lengths[I1]; + const auto X = wei_k_y_x_c_lengths[I2]; + + float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / + (std::size_t(1000) * 1000 * 1000) / ave_time; + + std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" + << std::endl; + } + } + + // copy result back to host + out_n_ho_wo_k_device_buf.FromDevice(out_n_ho_wo_k.mData.data()); +} diff --git a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp new file mode 100644 index 0000000000..b56cbc0335 --- /dev/null +++ b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp @@ -0,0 +1,206 @@ +#include +#include "device.hpp" +#include "host_tensor.hpp" +#include "transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp" +#include "driver_dynamic_gemm_xdlops_v2r3.hpp" + +template +void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw( + const InLengths& in_n_c_hi_wi_lengths, + const WeiLengths& wei_k_c_y_x_lengths, + const OutLengths& out_n_k_ho_wo_lengths, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const Tensor& in_n_c_hi_wi, + const Tensor& wei_k_c_y_x, + Tensor& out_n_k_ho_wo, + ck::index_t nrepeat) +{ + using namespace ck; + + std::cout << __func__ << std::endl; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + constexpr auto I5 = Number<5>{}; + constexpr auto I6 = Number<6>{}; + constexpr auto I7 = Number<7>{}; + constexpr auto I8 = Number<8>{}; + + DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace()); + DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace()); + DeviceMem out_n_k_ho_wo_device_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace()); + + in_n_c_hi_wi_device_buf.ToDevice(in_n_c_hi_wi.mData.data()); + wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data()); + out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data()); + + const auto in_n_c_hi_wi_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths); + const auto wei_k_c_y_x_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths); + const auto out_n_k_ho_wo_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths); + +#if 1 + // [M, N, K0, K1] = [256, 128, 4, 8] for fp16 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 256; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 32; + constexpr index_t GemmNPerWave = 32; + constexpr index_t GemmK1 = 8; + + constexpr index_t MRepeat = 4; + constexpr index_t NRepeat = 2; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 8>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8; + constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8; + + using 
GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 8>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1; + constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8; + + constexpr index_t GemmCThreadTransferDstScalarPerVector = 1; +#endif + + const auto descs = + transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc, + in_n_c_hi_wi_desc, + out_n_k_ho_wo_desc, + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads, + Number{}); + + const auto wei_gemmk0_gemmm_gemmk1_grid_desc = descs[I0]; + const auto in_gemmk0_gemmn_gemmk1_grid_desc = descs[I1]; + const auto out_gemmm_gemmn_grid_desc = descs[I2]; + + // HACK: hacks that control index calculation when iterating over A, B, C matrix + constexpr auto wei_gemmk0_gemmm_gemmk1_grid_iterator_hacks = make_tuple( + make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}), + make_tuple( + Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{})); + + constexpr auto in_gemmk0_gemmn_gemmk1_grid_iterator_hacks = + make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})); + + constexpr auto out_m0_m1_m2_n_grid_iterator_hacks = + make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{})); + + constexpr auto wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks = + Sequence<0, 0, 0, 0, 0>{}; + + constexpr auto in_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks = + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>{}; + + for(index_t i = 0; i < 5; ++i) + { + float ave_time = driver_dynamic_gemm_xdlops_v2r3< + BlockSize, + TInWei, + TAcc, + TOut, + InMemoryDataOperationEnum_t::Set, + decltype(wei_gemmk0_gemmm_gemmk1_grid_desc), + decltype(in_gemmk0_gemmn_gemmk1_grid_desc), + decltype(out_gemmm_gemmn_grid_desc), + GemmMPerBlock, + GemmNPerBlock, + GemmKPerBlock, + GemmMPerWave, + GemmNPerWave, + GemmK1, + MRepeat, + NRepeat, + GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1, + GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1, + Sequence<1, 0, 2>, + Sequence<1, 0, 2>, + 2, + GemmABlockTransferSrcScalarPerVector_GemmK1, + GemmABlockTransferDstScalarPerVector_GemmK1, + false, // don't move back src coordinate after threadwise copy + GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1, + GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1, + Sequence<0, 2, 1>, + Sequence<1, 0, 2>, + 1, + GemmBBlockTransferSrcScalarPerVector_GemmN, + GemmBBlockTransferDstScalarPerVector_GemmK1, + false, // don't move back src coordinate after threadwise copy + Sequence<3, 0, 1, 2, 7, 5, 4, 6>, + 7, + GemmCThreadTransferDstScalarPerVector, + 
decltype(wei_gemmk0_gemmm_gemmk1_grid_iterator_hacks), + decltype(in_gemmk0_gemmn_gemmk1_grid_iterator_hacks), + decltype(out_m0_m1_m2_n_grid_iterator_hacks), + decltype(wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks), + decltype(in_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks), + false>(static_cast(wei_k_c_y_x_device_buf.GetDeviceBuffer()), + static_cast(in_n_c_hi_wi_device_buf.GetDeviceBuffer()), + static_cast(out_n_k_ho_wo_device_buf.GetDeviceBuffer()), + wei_gemmk0_gemmm_gemmk1_grid_desc, + in_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc, + wei_gemmk0_gemmm_gemmk1_grid_iterator_hacks, + in_gemmk0_gemmn_gemmk1_grid_iterator_hacks, + out_m0_m1_m2_n_grid_iterator_hacks, + wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks, + in_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks, + nrepeat); + + float perf = (float)calculate_convolution_flops( + in_n_c_hi_wi_desc, wei_k_c_y_x_desc, out_n_k_ho_wo_desc) / + (std::size_t(1000) * 1000 * 1000) / ave_time; + + std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl; + } + + // copy result back to host + out_n_k_ho_wo_device_buf.FromDevice(out_n_k_ho_wo.mData.data()); +} diff --git a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk.hpp b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk.hpp new file mode 100644 index 0000000000..10284b48f3 --- /dev/null +++ b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,240 @@ +#include +#include "device.hpp" +#include "host_tensor.hpp" +#include "transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp" +#include "driver_dynamic_gemm_xdlops_v2r2.hpp" + +template +void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk( + const InLengths& in_n_hi_wi_c_lengths, + const WeiLengths& wei_k_y_x_c_lengths, + const OutLengths& out_n_ho_wo_k_lengths, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const Tensor& in_n_hi_wi_c, + const Tensor& wei_k_y_x_c, + Tensor& out_n_ho_wo_k, + ck::index_t nrepeat) +{ + using namespace ck; + + std::cout << __func__ << std::endl; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + constexpr auto I5 = Number<5>{}; + constexpr auto I6 = Number<6>{}; + constexpr auto I7 = Number<7>{}; + constexpr auto I8 = Number<8>{}; + + DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace()); + DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace()); + DeviceMem out_n_ho_wo_k_device_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace()); + + in_n_hi_wi_c_device_buf.ToDevice(in_n_hi_wi_c.mData.data()); + wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data()); + out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data()); + + const auto in_n_hi_wi_c_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths); + const auto wei_k_y_x_c_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths); + const auto out_n_ho_wo_k_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths); + +#if 1 + // [M, N, K0, K1] = [256, 128, 4, 4] for fp32 + constexpr 
index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 256; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 64; + constexpr index_t GemmNPerWave = 64; + constexpr index_t GemmK1 = 4; + + constexpr index_t MRepeat = 2; + constexpr index_t NRepeat = 1; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 4>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4; + constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 4; + constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4; + + constexpr index_t GemmCThreadTransferDstScalarPerVector = 4; +#elif 1 + // [M, N, K0, K1] = [256, 128, 4, 8] for fp16 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 256; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 64; + constexpr index_t GemmNPerWave = 64; + constexpr index_t GemmK1 = 8; + + constexpr index_t MRepeat = 2; + constexpr index_t NRepeat = 1; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 8>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8; + constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 8>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8; + constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8; + + constexpr index_t GemmCThreadTransferDstScalarPerVector = 4; +#endif + + const auto descs = + transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk_pad(wei_k_y_x_c_desc, + in_n_hi_wi_c_desc, + out_n_ho_wo_k_desc, + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads, + Number{}); + + const auto wei_gemmk0_gemmm_gemmk1_grid_desc = descs[I0]; + const auto in_gemmk0_gemmn_gemmk1_grid_desc = descs[I1]; + const auto out_gemmm_gemmn_grid_desc = descs[I2]; + + // HACK: hacks that control index calculation when iterating over A, B, C matrix + constexpr auto wei_gemmk0_gemmm_gemmk1_grid_iterator_hacks = make_tuple( + make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}), + make_tuple( + Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{})); + + constexpr auto in_gemmk0_gemmn_gemmk1_grid_iterator_hacks = + make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})); + + constexpr auto out_m0_m1_m2_n_grid_iterator_hacks = + make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0>{}, + 
Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{})); + + constexpr auto wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks = + Sequence<0, 0, 0, 0, 0>{}; + + constexpr auto in_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks = + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>{}; + + for(index_t i = 0; i < 5; ++i) + { + float ave_time = driver_dynamic_gemm_xdlops_v2r2< + BlockSize, + TInWei, + TAcc, + TOut, + InMemoryDataOperationEnum_t::Set, + decltype(wei_gemmk0_gemmm_gemmk1_grid_desc), + decltype(in_gemmk0_gemmn_gemmk1_grid_desc), + decltype(out_gemmm_gemmn_grid_desc), + GemmMPerBlock, + GemmNPerBlock, + GemmKPerBlock, + GemmMPerWave, + GemmNPerWave, + MRepeat, + NRepeat, + GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1, + GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1, + Sequence<1, 0, 2>, + Sequence<1, 0, 2>, + 2, + GemmABlockTransferSrcScalarPerVector_GemmK1, + GemmABlockTransferDstScalarPerVector_GemmK1, + false, // don't move back src coordinate after threadwise copy + GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1, + GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1, + Sequence<1, 0, 2>, + Sequence<1, 0, 2>, + 2, + GemmBBlockTransferSrcScalarPerVector_GemmK1, + GemmBBlockTransferDstScalarPerVector_GemmK1, + false, // don't move back src coordinate after threadwise copy + Sequence<2, 3, 0, 1>, + 2, + GemmCThreadTransferDstScalarPerVector, + decltype(wei_gemmk0_gemmm_gemmk1_grid_iterator_hacks), + decltype(in_gemmk0_gemmn_gemmk1_grid_iterator_hacks), + decltype(out_m0_m1_m2_n_grid_iterator_hacks), + decltype(wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks), + decltype(in_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks)>( + static_cast(wei_k_y_x_c_device_buf.GetDeviceBuffer()), + static_cast(in_n_hi_wi_c_device_buf.GetDeviceBuffer()), + static_cast(out_n_ho_wo_k_device_buf.GetDeviceBuffer()), + wei_gemmk0_gemmm_gemmk1_grid_desc, + in_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc, + wei_gemmk0_gemmm_gemmk1_grid_iterator_hacks, + in_gemmk0_gemmn_gemmk1_grid_iterator_hacks, + out_m0_m1_m2_n_grid_iterator_hacks, + wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks, + in_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks, + nrepeat); + + { + const auto N = out_n_ho_wo_k_lengths[I0]; + const auto K = out_n_ho_wo_k_lengths[I3]; + const auto C = wei_k_y_x_c_lengths[I3]; + + const auto Hi = in_n_hi_wi_c_lengths[I1]; + const auto Wi = in_n_hi_wi_c_lengths[I2]; + + const auto Ho = out_n_ho_wo_k_lengths[I1]; + const auto Wo = out_n_ho_wo_k_lengths[I2]; + + const auto Y = wei_k_y_x_c_lengths[I1]; + const auto X = wei_k_y_x_c_lengths[I2]; + + float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / + (std::size_t(1000) * 1000 * 1000) / ave_time; + + std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" + << std::endl; + } + } + + // copy result back to host + out_n_ho_wo_k_device_buf.FromDevice(out_n_ho_wo_k.mData.data()); +} diff --git a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp new file mode 100644 index 0000000000..f2a30fb525 --- /dev/null +++ b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,305 @@ +#include +#include "device.hpp" +#include "host_tensor.hpp" 
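+// Note on the reported numbers: every driver in this family prints throughput as
+// forward-convolution FLOPs (2 * N * K * Ho * Wo * C * Y * X multiply-accumulates,
+// equivalently calculate_convolution_flops()) divided by 10^9 and by the measured
+// average kernel time in milliseconds, which is numerically TFlop/s.  The helper
+// below is a minimal, self-contained sketch of that bookkeeping; its name is
+// illustrative only and it is not referenced by the drivers in this patch.
+#include <cstddef>
+
+inline float convolution_forward_tflops(std::size_t N,
+                                        std::size_t K,
+                                        std::size_t Ho,
+                                        std::size_t Wo,
+                                        std::size_t C,
+                                        std::size_t Y,
+                                        std::size_t X,
+                                        float ave_time_ms)
+{
+    // total forward-convolution FLOPs: one multiply and one add per MAC
+    const std::size_t flops = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
+
+    // GFLOP per millisecond equals TFlop/s, matching the drivers' perf printout
+    return static_cast<float>(flops) / (std::size_t(1000) * 1000 * 1000) / ave_time_ms;
+}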
+#include "transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp" +#include "driver_dynamic_gemm_xdlops_v2r3.hpp" + +template +void device_dynamic_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk( + const InLengths& in_n_hi_wi_c_lengths, + const WeiLengths& wei_k_y_x_c_lengths, + const OutLengths& out_n_ho_wo_k_lengths, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const Tensor& in_n_hi_wi_c, + const Tensor& wei_k_y_x_c, + Tensor& out_n_ho_wo_k, + ck::index_t nrepeat) +{ + using namespace ck; + + std::cout << __func__ << std::endl; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + constexpr auto I5 = Number<5>{}; + constexpr auto I6 = Number<6>{}; + constexpr auto I7 = Number<7>{}; + constexpr auto I8 = Number<8>{}; + + DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace()); + DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace()); + DeviceMem out_n_ho_wo_k_device_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace()); + + in_n_hi_wi_c_device_buf.ToDevice(in_n_hi_wi_c.mData.data()); + wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data()); + out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data()); + + const auto in_n_hi_wi_c_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths); + const auto wei_k_y_x_c_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths); + const auto out_n_ho_wo_k_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths); + +#if 1 + // [M, N, K0, K1] = [256, 128, 4, 4] for fp32 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 256; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 32; + constexpr index_t GemmNPerWave = 32; + constexpr index_t GemmK1 = 4; + + constexpr index_t MRepeat = 4; + constexpr index_t NRepeat = 2; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 4>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4; + constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 4; + constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4; + + constexpr index_t GemmCThreadTransferDstScalarPerVector = 4; +#elif 1 + // [M, N, K0, K1] = [128, 128, 4, 4] for fp32 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 128; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 32; + constexpr index_t GemmNPerWave = 32; + constexpr index_t GemmK1 = 4; + + constexpr index_t MRepeat = 2; + constexpr index_t NRepeat = 2; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 4>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4; + constexpr index_t 
GemmABlockTransferDstScalarPerVector_GemmK1 = 4; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 4; + constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4; + + constexpr index_t GemmCThreadTransferDstScalarPerVector = 4; +#elif 0 + // [M, N, K0, K1] = [256, 256, 4, 8] for fp16 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 256; + constexpr index_t GemmNPerBlock = 256; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 32; + constexpr index_t GemmNPerWave = 32; + constexpr index_t GemmK1 = 8; + + constexpr index_t MRepeat = 4; + constexpr index_t NRepeat = 4; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 8>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8; + constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 4, 8>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8; + constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8; + + constexpr index_t GemmCThreadTransferDstScalarPerVector = 4; +#elif 1 + // [M, N, K0, K1] = [256, 128, 4, 8] for fp16 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 256; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 32; + constexpr index_t GemmNPerWave = 32; + constexpr index_t GemmK1 = 8; + + constexpr index_t MRepeat = 4; + constexpr index_t NRepeat = 2; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 8>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8; + constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 8>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8; + constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8; + + constexpr index_t GemmCThreadTransferDstScalarPerVector = 4; +#endif + + const auto descs = + transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk_pad(wei_k_y_x_c_desc, + in_n_hi_wi_c_desc, + out_n_ho_wo_k_desc, + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads, + Number{}); + + const auto wei_gemmk0_gemmm_gemmk1_grid_desc = descs[I0]; + const auto in_gemmk0_gemmn_gemmk1_grid_desc = descs[I1]; + const auto out_gemmm_gemmn_grid_desc = descs[I2]; + + // HACK: hacks that control index calculation when iterating over A, B, C matrix + constexpr auto wei_gemmk0_gemmm_gemmk1_grid_iterator_hacks = make_tuple( + make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}), + make_tuple( + Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{})); + + constexpr auto in_gemmk0_gemmn_gemmk1_grid_iterator_hacks = + make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})); + + constexpr auto out_m0_m1_m2_n_grid_iterator_hacks = + make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{})); + + constexpr auto wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks = + Sequence<0, 0, 0, 0, 0>{}; + + constexpr auto in_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks = + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>{}; + + for(index_t i = 0; i < 5; ++i) + { + float ave_time = driver_dynamic_gemm_xdlops_v2r3< + BlockSize, + TInWei, + TAcc, + TOut, + InMemoryDataOperationEnum_t::Set, + decltype(wei_gemmk0_gemmm_gemmk1_grid_desc), + decltype(in_gemmk0_gemmn_gemmk1_grid_desc), + decltype(out_gemmm_gemmn_grid_desc), + GemmMPerBlock, + GemmNPerBlock, + GemmKPerBlock, + GemmMPerWave, + GemmNPerWave, + MRepeat, + NRepeat, + GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1, + GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1, + Sequence<1, 0, 2>, + Sequence<1, 0, 2>, + 2, + GemmABlockTransferSrcScalarPerVector_GemmK1, + GemmABlockTransferDstScalarPerVector_GemmK1, + false, // don't move back src coordinate after threadwise copy + GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1, + GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1, + Sequence<1, 0, 2>, + Sequence<1, 0, 2>, + 2, + GemmBBlockTransferSrcScalarPerVector_GemmK1, + GemmBBlockTransferDstScalarPerVector_GemmK1, + false, // don't move back src coordinate after threadwise copy + Sequence<2, 3, 0, 1, 7, 5, 4, 6>, + 6, + GemmCThreadTransferDstScalarPerVector, + decltype(wei_gemmk0_gemmm_gemmk1_grid_iterator_hacks), + decltype(in_gemmk0_gemmn_gemmk1_grid_iterator_hacks), + decltype(out_m0_m1_m2_n_grid_iterator_hacks), + decltype(wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks), + decltype(in_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks), + false // CAccessOrderMRepeatNRepeat + >(static_cast(wei_k_y_x_c_device_buf.GetDeviceBuffer()), + static_cast(in_n_hi_wi_c_device_buf.GetDeviceBuffer()), + static_cast(out_n_ho_wo_k_device_buf.GetDeviceBuffer()), + wei_gemmk0_gemmm_gemmk1_grid_desc, + in_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc, + wei_gemmk0_gemmm_gemmk1_grid_iterator_hacks, + in_gemmk0_gemmn_gemmk1_grid_iterator_hacks, + out_m0_m1_m2_n_grid_iterator_hacks, + wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks, + in_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks, + nrepeat); + + { + const auto N = out_n_ho_wo_k_lengths[I0]; + const auto K = out_n_ho_wo_k_lengths[I3]; + const auto C = wei_k_y_x_c_lengths[I3]; + + const auto Hi = in_n_hi_wi_c_lengths[I1]; + const auto Wi = in_n_hi_wi_c_lengths[I2]; + + const auto Ho = out_n_ho_wo_k_lengths[I1]; + const auto Wo = out_n_ho_wo_k_lengths[I2]; + + const auto Y = wei_k_y_x_c_lengths[I1]; + const auto X = wei_k_y_x_c_lengths[I2]; + + float perf = (float)(std::size_t(2) * N * K * Ho * 
Wo * C * Y * X) / + (std::size_t(1000) * 1000 * 1000) / ave_time; + + std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" + << std::endl; + } + } + + // copy result back to host + out_n_ho_wo_k_device_buf.FromDevice(out_n_ho_wo_k.mData.data()); +} diff --git a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp new file mode 100644 index 0000000000..601878c347 --- /dev/null +++ b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,365 @@ +#include +#include "device.hpp" +#include "host_tensor.hpp" +#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp" +#include "driver_dynamic_gemm_xdlops_v2r3.hpp" + +template +void device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk( + const InLengths& in_n_hi_wi_c_lengths, + const WeiLengths& wei_k_y_x_c_lengths, + const OutLengths& out_n_ho_wo_k_lengths, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const Tensor& in_n_hi_wi_c, + const Tensor& wei_k_y_x_c, + Tensor& out_n_ho_wo_k, + ck::index_t nrepeat) +{ + using namespace ck; + + std::cout << __func__ << std::endl; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + constexpr auto I5 = Number<5>{}; + constexpr auto I6 = Number<6>{}; + constexpr auto I7 = Number<7>{}; + constexpr auto I8 = Number<8>{}; + + DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace()); + DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace()); + DeviceMem out_n_ho_wo_k_device_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace()); + + in_n_hi_wi_c_device_buf.ToDevice(in_n_hi_wi_c.mData.data()); + wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data()); + out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data()); + + const auto in_n_hi_wi_c_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths); + const auto wei_k_y_x_c_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths); + const auto out_n_ho_wo_k_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths); + +#if 0 + // [M, N, K0, K1] = [256, 128, 4, 4] for fp32 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 256; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 32; + constexpr index_t GemmNPerWave = 32; + constexpr index_t GemmK1 = 4; + + constexpr index_t MRepeat = 4; + constexpr index_t NRepeat = 2; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 4>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4; + constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 4; + constexpr index_t 
GemmBBlockTransferDstScalarPerVector_GemmK1 = 4; + + constexpr index_t GemmCThreadTransferDstScalarPerVector = 1; +#elif 0 + // [M, N, K0, K1] = [128, 128, 4, 4] for fp32 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 128; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 32; + constexpr index_t GemmNPerWave = 32; + constexpr index_t GemmK1 = 4; + + constexpr index_t MRepeat = 2; + constexpr index_t NRepeat = 2; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 4>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4; + constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 4; + constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4; + + constexpr index_t GemmCThreadTransferDstScalarPerVector = 1; +#elif 0 + // [M, N, K0, K1] = [256, 256, 4, 8] for fp16 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 256; + constexpr index_t GemmNPerBlock = 256; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 32; + constexpr index_t GemmNPerWave = 32; + constexpr index_t GemmK1 = 8; + + constexpr index_t MRepeat = 4; + constexpr index_t NRepeat = 4; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 8>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8; + constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 4, 8>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8; + constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8; + + constexpr index_t GemmCThreadTransferDstScalarPerVector = 1; +#elif 0 + // [M, N, K0, K1] = [256, 128, 4, 8] for fp16 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 256; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 32; + constexpr index_t GemmNPerWave = 32; + constexpr index_t GemmK1 = 8; + + constexpr index_t MRepeat = 4; + constexpr index_t NRepeat = 2; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 8>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8; + constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 8>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8; + constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8; + + constexpr index_t GemmCThreadTransferDstScalarPerVector = 1; +#elif 1 + // [M, N, K0, K1] = [128, 256, 4, 8] for fp16 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 128; + constexpr 
index_t GemmNPerBlock = 256; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 32; + constexpr index_t GemmNPerWave = 32; + constexpr index_t GemmK1 = 8; + + constexpr index_t MRepeat = 2; + constexpr index_t NRepeat = 4; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 8>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8; + constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 4, 8>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8; + constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8; + + constexpr index_t GemmCThreadTransferDstScalarPerVector = 1; +#elif 1 + // [M, N, K0, K1] = [128, 128, 4, 8] for fp16 + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 128; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 4; + + constexpr index_t GemmMPerWave = 32; + constexpr index_t GemmNPerWave = 32; + constexpr index_t GemmK1 = 8; + + constexpr index_t MRepeat = 2; + constexpr index_t NRepeat = 2; + + using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 8>; + using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8; + constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8; + + using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 8>; + using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>; + + constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8; + constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8; + + constexpr index_t GemmCThreadTransferDstScalarPerVector = 1; +#endif + + const auto descs = + transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk_pad(in_n_hi_wi_c_desc, + wei_k_y_x_c_desc, + out_n_ho_wo_k_desc, + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads, + Number{}); + + const auto in_gemmk0_gemmm_gemmk1_grid_desc = descs[I0]; + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = descs[I1]; + const auto out_gemmm_gemmn_grid_desc = descs[I2]; + + // HACK: hacks that control index calculation when iterating over A, B, C matrix + constexpr auto in_gemmk0_gemmm_gemmk1_grid_iterator_hacks = + make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, // 0+: GemmK0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, // 1+: GemmM + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}), // 2+: GemmK1 + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, // 0-: GemmK0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, // 1-: GemmM + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})); // 2-: GemmK1 + + constexpr auto wei_gemmk0_gemmn_gemmk1_grid_iterator_hacks = + make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, // 0+: GemmK0 + Sequence<0, 0, 0, 0, 0>{}, // 1+: GemmN + Sequence<0, 0, 0, 0, 0>{}), // 2+: GemmK1 + make_tuple(Sequence<0, 0, 0, 0, 0>{}, // 0-: GemmK0 + Sequence<0, 0, 0, 0, 0>{}, // 1-: GemmN + Sequence<0, 0, 0, 0, 0>{})); // 2-: GemmK1 + + constexpr auto out_m0_m1_m2_n_grid_iterator_hacks = + make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, // 0+: MRepeat + Sequence<0, 0, 0, 0, 0>{}, // 1+: 
NRepeat + Sequence<0, 0, 0, 0, 0>{}, // 2+: MWaves + Sequence<0, 0, 0, 0, 0>{}, // 3+: NWaves + Sequence<0, 0, 0, 0, 0>{}, // 4+: M0 + Sequence<0, 0, 0, 0, 0>{}, // 5+: M1 + Sequence<0, 0, 0, 0, 0>{}, // 6+: M2 + Sequence<0, 0, 0, 0, 0>{}), // 7+: N1 + make_tuple(Sequence<0, 0, 0, 0, 0>{}, // 0-: MRepeat + Sequence<0, 0, 0, 0, 0>{}, // 1-: NRepeat + Sequence<0, 0, 0, 0, 0>{}, // 2-: MWaves + Sequence<0, 0, 0, 0, 0>{}, // 3-: NWaves + Sequence<0, 0, 0, 0, 0>{}, // 4-: M0 + Sequence<0, 0, 0, 0, 0>{}, // 5-: M1 + Sequence<0, 0, 0, 0, 0>{}, // 6-: M2 + Sequence<0, 0, 0, 0, 0>{})); // 7-: N1 + + constexpr auto in_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks = + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>{}; + + constexpr auto wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks = + Sequence<0, 0, 0, 0, 0>{}; + + for(index_t i = 0; i < 5; ++i) + { + float ave_time = driver_dynamic_gemm_xdlops_v2r3< + BlockSize, + TInWei, + TAcc, + TOut, + InMemoryDataOperationEnum_t::Set, + decltype(in_gemmk0_gemmm_gemmk1_grid_desc), + decltype(wei_gemmk0_gemmn_gemmk1_grid_desc), + decltype(out_gemmm_gemmn_grid_desc), + GemmMPerBlock, + GemmNPerBlock, + GemmKPerBlock, + GemmMPerWave, + GemmNPerWave, + GemmK1, + MRepeat, + NRepeat, + GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1, + GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1, + Sequence<1, 0, 2>, + Sequence<1, 0, 2>, + 2, + GemmABlockTransferSrcScalarPerVector_GemmK1, + GemmABlockTransferDstScalarPerVector_GemmK1, + false, // don't move back src coordinate after threadwise copy + GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1, + GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1, + Sequence<1, 0, 2>, + Sequence<1, 0, 2>, + 2, + GemmBBlockTransferSrcScalarPerVector_GemmK1, + GemmBBlockTransferDstScalarPerVector_GemmK1, + false, // don't move back src coordinate after threadwise copy + Sequence<2, 3, 0, 1, 7, 5, 4, 6>, + 7, + GemmCThreadTransferDstScalarPerVector, + decltype(in_gemmk0_gemmm_gemmk1_grid_iterator_hacks), + decltype(wei_gemmk0_gemmn_gemmk1_grid_iterator_hacks), + decltype(out_m0_m1_m2_n_grid_iterator_hacks), + decltype(in_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks), + decltype(wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks), + false // CAccessOrderMRepeatNRepeat + >(static_cast(in_n_hi_wi_c_device_buf.GetDeviceBuffer()), + static_cast(wei_k_y_x_c_device_buf.GetDeviceBuffer()), + static_cast(out_n_ho_wo_k_device_buf.GetDeviceBuffer()), + in_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc, + in_gemmk0_gemmm_gemmk1_grid_iterator_hacks, + wei_gemmk0_gemmn_gemmk1_grid_iterator_hacks, + out_m0_m1_m2_n_grid_iterator_hacks, + in_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks, + wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks, + nrepeat); + + { + const auto N = out_n_ho_wo_k_lengths[I0]; + const auto K = out_n_ho_wo_k_lengths[I3]; + const auto C = wei_k_y_x_c_lengths[I3]; + + const auto Hi = in_n_hi_wi_c_lengths[I1]; + const auto Wi = in_n_hi_wi_c_lengths[I2]; + + const auto Ho = out_n_ho_wo_k_lengths[I1]; + const auto Wo = out_n_ho_wo_k_lengths[I2]; + + const auto Y = wei_k_y_x_c_lengths[I1]; + const auto X = wei_k_y_x_c_lengths[I2]; + + float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / + (std::size_t(1000) * 1000 * 1000) / ave_time; + + std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" + << std::endl; + } + } + + // copy result back to host + 
out_n_ho_wo_k_device_buf.FromDevice(out_n_ho_wo_k.mData.data()); +} diff --git a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp new file mode 100644 index 0000000000..ca0d47c33a --- /dev/null +++ b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp @@ -0,0 +1,192 @@ +#include +#include "device.hpp" +#include "host_tensor.hpp" +#include "driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp" +#include "driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp" + +template +void device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw( + const InLengths& in_n_c_hi_wi_lengths, + const WeiLengths& wei_k_c_y_x_lengths, + const OutLengths& out_n_k_ho_wo_lengths, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const Tensor& in_n_c_hi_wi, + const Tensor& wei_k_c_y_x, + Tensor& out_n_k_ho_wo, + ck::index_t nrepeat) +{ + using namespace ck; + + std::cout << __func__ << std::endl; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const auto N = out_n_k_ho_wo_lengths[I0]; + const auto K = out_n_k_ho_wo_lengths[I1]; + const auto C = wei_k_c_y_x_lengths[I1]; + + const auto Hi = in_n_c_hi_wi_lengths[I2]; + const auto Wi = in_n_c_hi_wi_lengths[I3]; + + const auto Ho = out_n_k_ho_wo_lengths[I2]; + const auto Wo = out_n_k_ho_wo_lengths[I3]; + + const auto Y = wei_k_c_y_x_lengths[I2]; + const auto X = wei_k_c_y_x_lengths[I3]; + + const auto C0 = C / Number{}; + const auto C1 = Number{}; + + const auto K0 = K / Number{}; + const auto K1 = Number{}; + + Tensor in_n_c0_hi_wi_c1( + HostTensorDescriptor(std::initializer_list{N, C0, Hi, Wi, C1})); + Tensor wei_k_c0_y_x_c1( + HostTensorDescriptor(std::initializer_list{K, C0, Y, X, C1})); + Tensor out_n_k0_ho_wo_k1( + HostTensorDescriptor(std::initializer_list{N, K0, Ho, Wo, K1})); + + auto f_nchw2nc0hwc1 = [&](auto n, auto hi, auto wi, auto c) { + in_n_c0_hi_wi_c1(n, c / InWeiVectorSize, hi, wi, c % InWeiVectorSize) = + in_n_c_hi_wi(n, c, hi, wi); + }; + + auto f_kcyx2kc0yxc1 = [&](auto k, auto y, auto x, auto c) { + wei_k_c0_y_x_c1(k, c / InWeiVectorSize, y, x, c % InWeiVectorSize) = + wei_k_c_y_x(k, c, y, x); + }; + + make_ParallelTensorFunctor(f_nchw2nc0hwc1, N, Hi, Wi, C)(); + make_ParallelTensorFunctor(f_kcyx2kc0yxc1, K, Y, X, C)(); + + DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) * + in_n_c0_hi_wi_c1.mDesc.GetElementSpace()); + DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace()); + DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) * + out_n_k0_ho_wo_k1.mDesc.GetElementSpace()); + + in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data()); + wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data()); + + const auto in_n_c0_hi_wi_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, C0, Hi, Wi)); + const auto wei_k_c0_y_x_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C0, Y, X)); + const auto out_n_k0_ho_wo_k1_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1)); + +#if 1 + // cdata = 64, BlockSize = 64, 16x8x32x4 + constexpr index_t 
BlockSize = 64; + + constexpr index_t KPerBlock = 16; + constexpr index_t HoPerBlock = 8; + constexpr index_t WoPerBlock = 32; + constexpr index_t EPerBlock = 1; + + constexpr index_t KPerThread = KPerBlock; + constexpr index_t HoPerThread = 2; + constexpr index_t WoPerThread = 2; + constexpr index_t EPerThread = EPerBlock; + + using ABlockTransferThreadSliceLengths_E_K = Sequence<3, 1>; + using ABlockTransferThreadClusterLengths_E_K = Sequence<3 * EPerBlock, KPerBlock>; + + constexpr index_t ABlockTransferSrcScalarPerVector_E = 1; + constexpr index_t ABlockTransferDstScalarPerVector_K = 1; + + constexpr index_t BThreadTransferSrcScalarPerVector_W = 1; + + constexpr index_t CThreadTransferDstScalarPerVector_W = 16; + + static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, ""); +#else + constexpr index_t BlockSize = 64; + + constexpr index_t KPerBlock = 16; + constexpr index_t HoPerBlock = 8; + constexpr index_t WoPerBlock = 32; + constexpr index_t EPerBlock = 1; + + constexpr index_t KPerThread = 16; + constexpr index_t HoPerThread = 2; + constexpr index_t WoPerThread = 2; + constexpr index_t EPerThread = EPerBlock; + + using ABlockTransferThreadSliceLengths_E_K = Sequence<9, 1>; + using ABlockTransferThreadClusterLengths_E_K = Sequence; + + constexpr index_t ABlockTransferSrcScalarPerVector_E = 1; + constexpr index_t ABlockTransferDstScalarPerVector_K = 1; + + constexpr index_t BThreadTransferSrcScalarPerVector_W = 1; + + constexpr index_t CThreadTransferDstScalarPerVector_W = K1; + + static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, ""); +#endif + + constexpr auto conv_driver = +#if 0 + DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad +#else + DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outpad +#endif + ::type, + TAcc, + TOut, + KPerBlock, + HoPerBlock, + WoPerBlock, + EPerBlock, + KPerThread, + HoPerThread, + WoPerThread, + EPerThread, + ABlockTransferThreadSliceLengths_E_K, + ABlockTransferThreadClusterLengths_E_K, + ABlockTransferSrcScalarPerVector_E, + ABlockTransferDstScalarPerVector_K, + BThreadTransferSrcScalarPerVector_W, + CThreadTransferDstScalarPerVector_W>{}; + + conv_driver.Run(wei_k_c0_y_x_desc, + in_n_c0_hi_wi_desc, + out_n_k0_ho_wo_k1_desc, + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads, + static_cast::type*>( + wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()), + static_cast::type*>( + in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()), + static_cast(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer())); + + out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data()); + + auto f_nk0hwk1_to_nkhw = [&](auto n, auto k, auto ho, auto wo) { + out_n_k_ho_wo(n, k, ho, wo) = + out_n_k0_ho_wo_k1(n, k / InWeiVectorSize, ho, wo, k % InWeiVectorSize); + }; + + make_ParallelTensorFunctor(f_nk0hwk1_to_nkhw, N, K, Ho, Wo)(); +} diff --git a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp new file mode 100644 index 0000000000..8fb276b464 --- /dev/null +++ b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp @@ -0,0 +1,244 @@ +#pragma once +#include +#include "device.hpp" +#include "host_tensor.hpp" +#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp" +#include "driver_dynamic_contraction_dlops_v1r2.hpp" + +template +void 
device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw( + const InLengths& in_n_c_hi_wi_lengths, + const WeiLengths& wei_k_c_y_x_lengths, + const OutLengths& out_n_k_ho_wo_lengths, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const Tensor& in_n_c_hi_wi, + const Tensor& wei_k_c_y_x, + Tensor& out_n_k_ho_wo, + ck::index_t nrepeat) +{ + using namespace ck; + + std::cout << __func__ << std::endl; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + + DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace()); + DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace()); + DeviceMem out_n_k_ho_wo_device_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace()); + + in_n_c_hi_wi_device_buf.ToDevice(in_n_c_hi_wi.mData.data()); + wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data()); + out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data()); + + const auto in_desc_n_c_hi_wi = + make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths); + const auto wei_desc_k_c_y_x = + make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths); + const auto out_desc_n_k_ho_wo = + make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths); + +#if 1 + // [8, 1, 128, 1] * [8, 4, 32, 1] = [1, 128, 4, 32] for fp32 + // cdata = 64, BlockSize = 256 + constexpr index_t BlockSize = 256; + + constexpr index_t GN0 = 4; + constexpr index_t GK1 = 1; + + constexpr index_t GM1PerBlockGM11 = 128; + constexpr index_t GN1PerBlockGN11 = 32; + constexpr index_t GK0PerBlock = 8; + + constexpr index_t BM1PerThreadBM11 = 4; + constexpr index_t BN1PerThreadBN11 = 4; + constexpr index_t BK0PerThread = 1; + + using BM10BN10ThreadClusterBM10Xs = Sequence<8, 2>; + using BM10BN10ThreadClusterBN10Xs = Sequence<8, 2>; + + using ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 1>; + using ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<2, 1, 1, 128, 1>; + + using ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 1>; + using ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<1, 1, 1, 1, 1>; + + using BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 4, 1, 1, 1>; + using BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<8, 1, 1, 32, 1>; + + using BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 1>; + using BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 1>; + + constexpr index_t CThreadTransferDstScalarPerVector_BN1 = 1; +#elif 1 + // [8, 1, 128, 2] * [8, 4, 32, 2] = [1, 128, 4, 32] for fp16 + // cdata = 64, BlockSize = 256 + constexpr index_t BlockSize = 256; + + constexpr index_t GN0 = 4; + constexpr index_t GK1 = 2; + + constexpr index_t GM1PerBlockGM11 = 128; + constexpr index_t GN1PerBlockGN11 = 32; + constexpr index_t GK0PerBlock = 8; + + constexpr index_t BM1PerThreadBM11 = 4; + constexpr index_t BN1PerThreadBN11 = 4; + constexpr index_t BK0PerThread = 1; + + using BM10BN10ThreadClusterBM10Xs = Sequence<8, 2>; + using BM10BN10ThreadClusterBN10Xs = Sequence<8, 2>; + + using ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 2>; + using ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<2, 1, 1, 128, 1>; + + using 
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 1>; + using ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<1, 1, 1, 1, 2>; + + using BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 4, 1, 1, 2>; + using BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<8, 1, 1, 32, 1>; + + using BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 1>; + using BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 2>; + + constexpr index_t CThreadTransferDstScalarPerVector_BN1 = 1; +#endif + + const auto descs = + transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad(wei_desc_k_c_y_x, + in_desc_n_c_hi_wi, + out_desc_n_k_ho_wo, + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads, + Number{}, + Number{}); + + const auto wei_grid_desc_gk0_gm0_gm1_gk1 = descs[I0]; + const auto in_grid_desc_gk0_gn0_gn1_gk1 = descs[I1]; + const auto out_grid_desc_gm0_gm1_gn0_gn1 = descs[I2]; + + // HACK: hacks that control index calculation when iterating over A, B, C matrix + constexpr auto wei_grid_iterator_hacks = + make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 0+: GK0 + Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 1+: GM0 + Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 2+: GM10 + Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 3+: GM11 + Sequence<0, 0, 0, 0, 0, 0, 0>{}), // 4+: GK1 + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 0-: GK0 + Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 1-: GM0 + Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 2-: GM10 + Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 3-: GM11 + Sequence<0, 0, 0, 0, 0, 0, 0>{})); // 4-: GK1 + + constexpr auto in_grid_iterator_hacks = make_tuple( + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GK0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 1+: GN0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 2+: GN10 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 3+: GN11 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 4+: GK1 + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GK0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 1-: GN0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 2-: GN10 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 3-: GN11 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 4-: GK1 + + constexpr auto out_grid_iterator_hacks = make_tuple( + make_tuple( + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GM10 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 1+: BM0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 2+: BM1 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3+: GN10 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}, // 4+: BN0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}), // 5+: GN1 + make_tuple( + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GM10 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 1-: BM0 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 2-: BM1 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3-: GN10 + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}, // 4-: BN0 + 
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{})); // 5-: GN1 + + constexpr auto wei_grid_move_slice_window_iterator_hacks = Sequence<0, 0, 0, 0, 0, 0, 0>{}; + + constexpr auto in_grid_move_slice_window_iterator_hacks = + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0>{}; + + for(index_t i = 0; i < 5; ++i) + { + float ave_time = driver_dynamic_contraction_dlops_v1r2< + BlockSize, + TInWei, + TAcc, + TOut, + InMemoryDataOperationEnum_t::Set, + decltype(wei_grid_desc_gk0_gm0_gm1_gk1), + decltype(in_grid_desc_gk0_gn0_gn1_gk1), + decltype(out_grid_desc_gm0_gm1_gn0_gn1), + GM1PerBlockGM11, + GN1PerBlockGN11, + GK0PerBlock, + BM1PerThreadBM11, + BN1PerThreadBN11, + BK0PerThread, + BM10BN10ThreadClusterBM10Xs, + BM10BN10ThreadClusterBN10Xs, + ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1, + ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1, + Sequence<1, 2, 3, 0, 4>, // ABlockTransferThreadClusterArrangeOrder + Sequence<3, 2, 1, 0, 4>, // ABlockTransferSrcAccessOrder + ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1, + ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1, + Sequence<0, 1, 2, 3, 4>, // ABlockTransferSrcVectorTensorContiguousDimOrder + BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1, + BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1, + Sequence<0, 4, 1, 2, 3>, // BBlockTransferThreadClusterArrangeOrder + Sequence<4, 3, 2, 0, 1>, // BBlockTransferSrcAccessOrder + BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1, + BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1, + Sequence<0, 1, 2, 3, 4>, // BBlockTransferSrcVectorTensorContiguousDimOrder + Sequence<3, 4, 5, 0, 1, 2>, // CThreadTransferSrcDstAccessOrder + 5, // CThreadTransferSrcDstVectorDim + CThreadTransferDstScalarPerVector_BN1, + decltype(wei_grid_iterator_hacks), + decltype(in_grid_iterator_hacks), + decltype(out_grid_iterator_hacks), + decltype(wei_grid_move_slice_window_iterator_hacks), + decltype(in_grid_move_slice_window_iterator_hacks)>( + static_cast(wei_k_c_y_x_device_buf.GetDeviceBuffer()), + static_cast(in_n_c_hi_wi_device_buf.GetDeviceBuffer()), + static_cast(out_n_k_ho_wo_device_buf.GetDeviceBuffer()), + wei_grid_desc_gk0_gm0_gm1_gk1, + in_grid_desc_gk0_gn0_gn1_gk1, + out_grid_desc_gm0_gm1_gn0_gn1, + wei_grid_iterator_hacks, + in_grid_iterator_hacks, + out_grid_iterator_hacks, + wei_grid_move_slice_window_iterator_hacks, + in_grid_move_slice_window_iterator_hacks, + nrepeat); + + float perf = (float)calculate_convolution_flops( + in_desc_n_c_hi_wi, wei_desc_k_c_y_x, out_desc_n_k_ho_wo) / + (std::size_t(1000) * 1000 * 1000) / ave_time; + + std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl; + } + + // copy result back to host + out_n_k_ho_wo_device_buf.FromDevice(out_n_k_ho_wo.mData.data()); +} diff --git a/host/driver_offline/include/driver_dynamic_contraction_dlops_v1r2.hpp b/host/driver_offline/include/driver_dynamic_contraction_dlops_v1r2.hpp new file mode 100644 index 0000000000..2f175962c1 --- /dev/null +++ b/host/driver_offline/include/driver_dynamic_contraction_dlops_v1r2.hpp @@ -0,0 +1,290 @@ +#ifndef DRIVER_DYNAMIC_CONTRACTION_DLOPS_V1R2_HPP +#define DRIVER_DYNAMIC_CONTRACTION_DLOPS_V1R2_HPP + +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "gridwise_dynamic_contraction_dlops_v1r2.hpp" + +template +__host__ float +driver_dynamic_contraction_dlops_v1r2(const FloatAB* p_a_grid, + const 
FloatAB* p_b_grid, + FloatC* p_c_grid, + const AGridDesc_GK0_GM0_GM1_GK1& a_grid_desc_gk0_gm0_gm1_gk1, + const BGridDesc_GK0_GN0_GN1_GK1& b_grid_desc_gk0_gn0_gn1_gk1, + const CGridDesc_GM0_GM1_GN0_GN1& c_grid_desc_gm0_gm1_gn0_gn1, + AGridIteratorHacks, + BGridIteratorHacks, + CGridIteratorHacks, + AGridMoveSliceWindowIteratorHacks, + BGridMoveSliceWindowIteratorHacks, + ck::index_t nrepeat) + +{ + using namespace ck; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + constexpr auto I5 = Number<5>{}; + + // GEMM + using GridwiseContraction = + GridwiseDynamicContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1< + BlockSize, + FloatAB, + FloatAcc, + FloatC, + CGlobalMemoryDataOperation, + AGridDesc_GK0_GM0_GM1_GK1, + BGridDesc_GK0_GN0_GN1_GK1, + CGridDesc_GM0_GM1_GN0_GN1, + GM1PerBlockGM11, + GN1PerBlockGN11, + GK0PerBlock, + BM1PerThreadBM11, + BN1PerThreadBN11, + BK0PerThread, + BM10BN10ThreadClusterBM10Xs, + BM10BN10ThreadClusterBN10Xs, + ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1, + ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1, + ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1, + ABlockTransferSrcVectorTensorContiguousDimOrder, + BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1, + BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1, + BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1, + BBlockTransferSrcVectorTensorContiguousDimOrder, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + AGridIteratorHacks, + BGridIteratorHacks, + CGridIteratorHacks, + AGridMoveSliceWindowIteratorHacks, + BGridMoveSliceWindowIteratorHacks>; + + const auto GK0 = a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I0); + + if(!GridwiseContraction::CheckValidity( + a_grid_desc_gk0_gm0_gm1_gk1, b_grid_desc_gk0_gn0_gn1_gk1, c_grid_desc_gm0_gm1_gn0_gn1)) + { + throw std::runtime_error("wrong! 
" + "GridwiseDynamicContraction_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_" + "GM0_GM1_GN0_GN1 has invalid setting"); + } + + const auto a_grid_desc_gk0_gm0_gm10_gm11_gk1 = + GridwiseContraction::MakeAGridDescriptor_GK0_GM0_GM10_GM11_GK1(a_grid_desc_gk0_gm0_gm1_gk1); + const auto b_grid_desc_gk0_gn0_gn10_gn11_gk1 = + GridwiseContraction::MakeBGridDescriptor_GK0_GN0_GN10_GN11_GK1(b_grid_desc_gk0_gn0_gn1_gk1); + + using AGridDesc_GK0_GM0_GM10_GM11_GK1 = decltype(a_grid_desc_gk0_gm0_gm10_gm11_gk1); + using BGridDesc_GK0_GN0_GN10_GN11_GK1 = decltype(b_grid_desc_gk0_gn0_gn10_gn11_gk1); + + // c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1 + const auto c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1 = + GridwiseContraction::MakeCGridDescriptor_GM10_BM0_BM1_GN10_BN0_BN1( + c_grid_desc_gm0_gm1_gn0_gn1); + + using CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1 = decltype(c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1); + + // c_grid_block_cluster_blockid_to_gm10_gn10 + const auto c_grid_block_cluster_blockid_to_gm10_gn10 = + GridwiseContraction::MakeCGridBlockCluster_BlockId_To_GM10_GN10( + c_grid_desc_gm0_gm1_gn0_gn1); + + using CGridBlockCluster_BlockId_To_GM10_GN10 = + decltype(c_grid_block_cluster_blockid_to_gm10_gn10); + + const index_t grid_size = GridwiseContraction::CalculateGridSize(c_grid_desc_gm0_gm1_gn0_gn1); + + const bool has_main_k_block_loop = GridwiseContraction::CalculateHasMainKBlockLoop(GK0); + + const bool has_double_tail_k_block_loop = + GridwiseContraction::CalculateHasDoubleTailKBlockLoop(GK0); + + { + std::cout << "a_grid_desc_gk0_gm0_gm10_gm11_gk1{" + << a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetLength(I0) << ", " + << a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetLength(I1) << ", " + << a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetLength(I2) << ", " + << a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetLength(I3) << ", " + << a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetLength(I4) << "}" << std::endl; + + std::cout << "b_grid_desc_gk0_gn0_gn10_gn11_gk1{" + << b_grid_desc_gk0_gn0_gn10_gn11_gk1.GetLength(I0) << ", " + << b_grid_desc_gk0_gn0_gn10_gn11_gk1.GetLength(I1) << ", " + << b_grid_desc_gk0_gn0_gn10_gn11_gk1.GetLength(I2) << ", " + << b_grid_desc_gk0_gn0_gn10_gn11_gk1.GetLength(I3) << ", " + << b_grid_desc_gk0_gn0_gn10_gn11_gk1.GetLength(I4) << "}" << std::endl; + + std::cout << "c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1{ " + << c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1.GetLength(I0) << ", " + << c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1.GetLength(I1) << ", " + << c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1.GetLength(I2) << ", " + << c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1.GetLength(I3) << ", " + << c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1.GetLength(I4) << ", " + << c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1.GetLength(I5) << "}" << std::endl; + } + + float ave_time = 0; + + if(has_main_k_block_loop && has_double_tail_k_block_loop) + { + const auto kernel = kernel_dynamic_contraction_dlops_v1r2< + GridwiseContraction, + FloatAB, + FloatC, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + true>; + + ave_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + 0, + p_a_grid, + p_b_grid, + p_c_grid, + a_grid_desc_gk0_gm0_gm10_gm11_gk1, + b_grid_desc_gk0_gn0_gn10_gn11_gk1, + c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1, + c_grid_block_cluster_blockid_to_gm10_gn10); + } + else if(has_main_k_block_loop && !has_double_tail_k_block_loop) + { + const auto kernel = kernel_dynamic_contraction_dlops_v1r2< + GridwiseContraction, + FloatAB, + FloatC, + remove_reference_t, + remove_reference_t, + remove_reference_t, + 
remove_reference_t, + true, + false>; + + ave_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + 0, + p_a_grid, + p_b_grid, + p_c_grid, + a_grid_desc_gk0_gm0_gm10_gm11_gk1, + b_grid_desc_gk0_gn0_gn10_gn11_gk1, + c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1, + c_grid_block_cluster_blockid_to_gm10_gn10); + } + else if(!has_main_k_block_loop && has_double_tail_k_block_loop) + { + const auto kernel = kernel_dynamic_contraction_dlops_v1r2< + GridwiseContraction, + FloatAB, + FloatC, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + true>; + + ave_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + 0, + p_a_grid, + p_b_grid, + p_c_grid, + a_grid_desc_gk0_gm0_gm10_gm11_gk1, + b_grid_desc_gk0_gn0_gn10_gn11_gk1, + c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1, + c_grid_block_cluster_blockid_to_gm10_gn10); + } + else + { + const auto kernel = kernel_dynamic_contraction_dlops_v1r2< + GridwiseContraction, + FloatAB, + FloatC, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + false>; + + ave_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + 0, + p_a_grid, + p_b_grid, + p_c_grid, + a_grid_desc_gk0_gm0_gm10_gm11_gk1, + b_grid_desc_gk0_gn0_gn10_gn11_gk1, + c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1, + c_grid_block_cluster_blockid_to_gm10_gn10); + } + + return ave_time; +} +#endif diff --git a/host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp new file mode 100644 index 0000000000..7c4b1043f3 --- /dev/null +++ b/host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp @@ -0,0 +1,352 @@ +#ifndef DRIVER_DYNAMIC_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_HPP +#define DRIVER_DYNAMIC_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_HPP + +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "gridwise_dynamic_gemm_dlops_v2.hpp" +#include "gridwise_operation_wrapper.hpp" + +template +struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad +{ + template + __host__ void Run(const ck::DynamicTensorDescriptor& wei_k_c_y_x_global_desc, + const ck::DynamicTensorDescriptor& in_n_c_hi_wi_global_desc, + const ck::DynamicTensorDescriptor& out_n_k0_ho_wo_k1_global_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const FloatAB* __restrict__ p_wei_global, + const FloatAB* __restrict__ p_in_global, + FloatC* __restrict__ p_out_global) const + { + using namespace ck; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + + const auto N = in_n_c_hi_wi_global_desc.GetLength(I0); + const auto C = in_n_c_hi_wi_global_desc.GetLength(I1); + const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1); + + const auto Hi = in_n_c_hi_wi_global_desc.GetLength(I2); + const auto Wi = in_n_c_hi_wi_global_desc.GetLength(I3); + + const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2); + const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3); + + const auto K1 = 
out_n_k0_ho_wo_k1_global_desc.GetLength(I4); + + const auto K = wei_k_c_y_x_global_desc.GetLength(I0); + const auto Y = wei_k_c_y_x_global_desc.GetLength(I2); + const auto X = wei_k_c_y_x_global_desc.GetLength(I3); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + // weight tensor + const auto wei_e_k_global_desc = transform_dynamic_tensor_descriptor( + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C * Y * X)), + make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + // input tensor + const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor( + in_n_c_hi_wi_global_desc, + make_tuple(make_pass_through_transform(N), + make_pass_through_transform(C), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor( + in_n_c_hip_wip_global_desc, + make_tuple( + make_pass_through_transform(N), + make_pass_through_transform(C), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); + + const auto in_e_n_ho_wo_global_desc = transform_dynamic_tensor_descriptor( + in_n_c_y_ho_x_wo_global_desc, + make_tuple(make_merge_transform(make_tuple(C, Y, X)), + make_pass_through_transform(N), + make_pass_through_transform(Ho), + make_pass_through_transform(Wo)), + make_tuple(Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + // output tensor + const auto out_k_n_ho_wo_global_desc = transform_dynamic_tensor_descriptor( + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1)), + make_tuple(make_merge_transform(make_tuple(K0, K1)), + make_pass_through_transform(N), + make_pass_through_transform(Ho), + make_pass_through_transform(Wo)), + make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto E = C * Y * X; + + if(!((K % KPerBlock) == 0 && (Ho % HoPerBlock) == 0 && (Wo % WoPerBlock) == 0 && + (E % EPerBlock) == 0)) + { + throw std::runtime_error("wrong! 
GEMM size no divisible"); + } + + // hack to control index calculation when iterating over a_k_m_global tensor + constexpr auto a_e_k_global_iterator_hacks = + make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{})); + + constexpr auto a_e_k_global_move_slice_window_iterator_hack = Sequence<0, 0, 0>{}; + + constexpr auto b_e_n_ho_wo_global_iterator_hacks = + make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); + + constexpr auto b_e_n_ho_wo_global_move_slice_window_iterator_hack = + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}; + + // hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor + // hack for NKHW format + constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = + make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 2, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{})); + +#if 1 + // GEMM + using gridwise_gemm = GridwiseDynamicGemmDlops_km_kn_mn_v3< + BlockSize, + FloatAB, + FloatAcc, + FloatC, + InMemoryDataOperationEnum_t::Set, + decltype(wei_e_k_global_desc), + decltype(in_e_n_ho_wo_global_desc), + decltype(out_k_n_ho_wo_global_desc), + KPerBlock, + HoPerBlock, + WoPerBlock, + EPerBlock, + KPerThread, + HoPerThread, + WoPerThread, + EPerThread, + ABlockTransferThreadSliceLengths_E_K, + ABlockTransferThreadClusterLengths_E_K, + Sequence<1, 0>, + Sequence<1, 0>, + 0, + ABlockTransferSrcScalarPerVector_E, + ABlockTransferDstScalarPerVector_K, + false, // don't move back src coordinate after threadwise copy + Sequence<0, 2, 3, 1>, + 3, + BThreadTransferSrcScalarPerVector_W, + false, // don't move back src coordinate after threadwise copy, which will be fused with + // MoveSrcSliceWindow() to save addr computation + Sequence<0, 2, 3, 1>, + 0, + CThreadTransferDstScalarPerVector_W, + decltype(a_e_k_global_iterator_hacks), + decltype(b_e_n_ho_wo_global_iterator_hacks), + decltype(c_k_n_ho_wo_global_tensor_iterator_hacks), + decltype(a_e_k_global_move_slice_window_iterator_hack), + decltype(b_e_n_ho_wo_global_move_slice_window_iterator_hack)>; + + const auto GridSize = (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock) * N; + + const bool has_main_k_block_loop = (E + EPerBlock) / (2 * EPerBlock) > 1; + + const bool has_double_tail_k_block_loop = (E / EPerBlock) % 2 == 0; + + index_t nrepeat = 100; + + for(index_t i = 0; i < 5; ++i) + { + std::cout << "Start running " << nrepeat << " times..." 
<< std::endl; + + KernelTimer timer; + timer.Start(); + std::cout << "has_main_k_block_loop: " << has_main_k_block_loop + << " has_double_tail_k_block_loop: " << has_double_tail_k_block_loop + << std::endl; + + for(index_t j = 0; j < nrepeat; ++j) + { + if(has_main_k_block_loop && has_double_tail_k_block_loop) + { + const auto kernel = run_gridwise_operation, + integral_constant>; + + launch_kernel(kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + wei_e_k_global_desc, + p_wei_global, + in_e_n_ho_wo_global_desc, + p_in_global, + out_k_n_ho_wo_global_desc, + p_out_global, + integral_constant{}, + integral_constant{}); + } + else if(has_main_k_block_loop && !has_double_tail_k_block_loop) + { + const auto kernel = run_gridwise_operation, + integral_constant>; + + launch_kernel(kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + wei_e_k_global_desc, + p_wei_global, + in_e_n_ho_wo_global_desc, + p_in_global, + out_k_n_ho_wo_global_desc, + p_out_global, + integral_constant{}, + integral_constant{}); + } + else if(!has_main_k_block_loop && has_double_tail_k_block_loop) + { + const auto kernel = run_gridwise_operation, + integral_constant>; + + launch_kernel(kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + wei_e_k_global_desc, + p_wei_global, + in_e_n_ho_wo_global_desc, + p_in_global, + out_k_n_ho_wo_global_desc, + p_out_global, + integral_constant{}, + integral_constant{}); + } + else + { + const auto kernel = run_gridwise_operation, + integral_constant>; + + launch_kernel(kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + wei_e_k_global_desc, + p_wei_global, + in_e_n_ho_wo_global_desc, + p_in_global, + out_k_n_ho_wo_global_desc, + p_out_global, + integral_constant{}, + integral_constant{}); + } + } + + timer.End(); + + float ave_time = timer.GetElapsedTime() / nrepeat; + + float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc, + wei_k_c_y_x_global_desc, + out_n_k0_ho_wo_k1_global_desc) / + (std::size_t(1000) * 1000 * 1000) / ave_time; + + std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" + << std::endl; + } +#endif + } +}; +#endif diff --git a/host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp b/host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp new file mode 100644 index 0000000000..b7f8e6039c --- /dev/null +++ b/host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp @@ -0,0 +1,367 @@ +#ifndef DRIVER_DYNAMIC_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NCHW_KCYX_NKHW_OUTPAD_HPP +#define DRIVER_DYNAMIC_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NCHW_KCYX_NKHW_OUTPAD_HPP + +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "gridwise_dynamic_gemm_dlops_v2.hpp" +#include "gridwise_operation_wrapper.hpp" + +template +struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outpad +{ + template + __host__ void Run(const ck::DynamicTensorDescriptor& wei_k_c_y_x_global_desc, + const ck::DynamicTensorDescriptor& in_n_c_hi_wi_global_desc, + const ck::DynamicTensorDescriptor& out_n_k0_ho_wo_k1_global_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const FloatAB* __restrict__ p_wei_global, + const FloatAB* __restrict__ p_in_global, + FloatC* __restrict__ 
p_out_global) const + { + using namespace ck; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + + const auto N = in_n_c_hi_wi_global_desc.GetLength(I0); + const auto C = in_n_c_hi_wi_global_desc.GetLength(I1); + const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1); + + const auto Hi = in_n_c_hi_wi_global_desc.GetLength(I2); + const auto Wi = in_n_c_hi_wi_global_desc.GetLength(I3); + + const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2); + const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3); + + const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4); + + const auto K = wei_k_c_y_x_global_desc.GetLength(I0); + const auto Y = wei_k_c_y_x_global_desc.GetLength(I2); + const auto X = wei_k_c_y_x_global_desc.GetLength(I3); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock; + const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock; + + const auto OutRightPadH = Hop - Ho; + const auto OutRightPadW = Wop - Wo; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0] + OutRightPadH * ConvStrideH; + const auto InRightPadW = in_right_pads[I1] + OutRightPadW * ConvStrideW; + + std::cerr << "OutRightPadH = " << OutRightPadH << " OutRightPadW = " << OutRightPadW + << std::endl; + std::cerr << "InRightPadH = " << InRightPadH << " InRightPadW = " << InRightPadW + << std::endl; + + // weight tensor + const auto wei_e_k_global_desc = transform_dynamic_tensor_descriptor( + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C * Y * X)), + make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + // input tensor + const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor( + in_n_c_hi_wi_global_desc, + make_tuple(make_pass_through_transform(N), + make_pass_through_transform(C), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor( + in_n_c_hip_wip_global_desc, + make_tuple( + make_pass_through_transform(N), + make_pass_through_transform(C), + make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); + + const auto in_e_n_ho_wo_global_desc = transform_dynamic_tensor_descriptor( + in_n_c_y_ho_x_wo_global_desc, + make_tuple(make_merge_transform(make_tuple(C, Y, X)), + make_pass_through_transform(N), + make_pass_through_transform(Hop), + make_pass_through_transform(Wop)), + make_tuple(Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + // output tensor + const auto out_k_n_hop_wop_global_desc = 
transform_dynamic_tensor_descriptor( + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1)), + make_tuple(make_merge_transform(make_tuple(K0, K1)), + make_pass_through_transform(N), + make_pad_transform(Ho, 0, OutRightPadH), + make_pad_transform(Wo, 0, OutRightPadW)), + make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto E = C * Y * X; + + std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl; + + if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 && + (E % EPerBlock) == 0)) + { + throw std::runtime_error("wrong! GEMM size no divisible"); + } + + // hack to control index calculation when iterating over a_k_m_global tensor + constexpr auto a_e_k_global_iterator_hacks = + make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{})); + + constexpr auto a_e_k_global_move_slice_window_iterator_hack = Sequence<0, 0, 0>{}; + + constexpr auto b_e_n_ho_wo_global_iterator_hacks = + make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); + + constexpr auto b_e_n_ho_wo_global_move_slice_window_iterator_hack = + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}; + + // hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor + // hack for NKHW format + constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = + make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 2, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{})); + + // GEMM + using gridwise_gemm = GridwiseDynamicGemmDlops_km_kn_mn_v3< + BlockSize, + FloatAB, + FloatAcc, + FloatC, + InMemoryDataOperationEnum_t::Set, + decltype(wei_e_k_global_desc), + decltype(in_e_n_ho_wo_global_desc), + decltype(out_k_n_hop_wop_global_desc), + KPerBlock, + HoPerBlock, + WoPerBlock, + EPerBlock, + KPerThread, + HoPerThread, + WoPerThread, + EPerThread, + ABlockTransferThreadSliceLengths_E_K, + ABlockTransferThreadClusterLengths_E_K, + Sequence<1, 0>, + Sequence<1, 0>, + 0, + ABlockTransferSrcScalarPerVector_E, + ABlockTransferDstScalarPerVector_K, + false, // don't move back src coordinate after threadwise copy + Sequence<0, 2, 3, 1>, + 3, + BThreadTransferSrcScalarPerVector_W, + false, // don't move back src coordinate after threadwise copy, which will be fused with + // MoveSrcSliceWindow() to save addr computation + Sequence<0, 2, 3, 1>, + 0, + CThreadTransferDstScalarPerVector_W, + decltype(a_e_k_global_iterator_hacks), + decltype(b_e_n_ho_wo_global_iterator_hacks), + decltype(c_k_n_ho_wo_global_tensor_iterator_hacks), + decltype(a_e_k_global_move_slice_window_iterator_hack), + decltype(b_e_n_ho_wo_global_move_slice_window_iterator_hack)>; + + const auto GridSize = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N; + + const bool has_main_k_block_loop = (E + EPerBlock) / (2 * EPerBlock) > 1; + + const bool has_double_tail_k_block_loop = (E / 
EPerBlock) % 2 == 0; + + index_t nrepeat = 100; + + for(index_t i = 0; i < 5; ++i) + { + std::cout << "Start running " << nrepeat << " times..." << std::endl; + + KernelTimer timer; + timer.Start(); + std::cout << "has_main_k_block_loop: " << has_main_k_block_loop + << " has_double_tail_k_block_loop: " << has_double_tail_k_block_loop + << std::endl; + + for(index_t j = 0; j < nrepeat; ++j) + { + if(has_main_k_block_loop && has_double_tail_k_block_loop) + { + const auto kernel = + run_gridwise_operation, + integral_constant>; + + launch_kernel(kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + wei_e_k_global_desc, + p_wei_global, + in_e_n_ho_wo_global_desc, + p_in_global, + out_k_n_hop_wop_global_desc, + p_out_global, + integral_constant{}, + integral_constant{}); + } + else if(has_main_k_block_loop && !has_double_tail_k_block_loop) + { + const auto kernel = + run_gridwise_operation, + integral_constant>; + + launch_kernel(kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + wei_e_k_global_desc, + p_wei_global, + in_e_n_ho_wo_global_desc, + p_in_global, + out_k_n_hop_wop_global_desc, + p_out_global, + integral_constant{}, + integral_constant{}); + } + else if(!has_main_k_block_loop && has_double_tail_k_block_loop) + { + const auto kernel = + run_gridwise_operation, + integral_constant>; + + launch_kernel(kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + wei_e_k_global_desc, + p_wei_global, + in_e_n_ho_wo_global_desc, + p_in_global, + out_k_n_hop_wop_global_desc, + p_out_global, + integral_constant{}, + integral_constant{}); + } + else + { + const auto kernel = + run_gridwise_operation, + integral_constant>; + + launch_kernel(kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + 0, + wei_e_k_global_desc, + p_wei_global, + in_e_n_ho_wo_global_desc, + p_in_global, + out_k_n_hop_wop_global_desc, + p_out_global, + integral_constant{}, + integral_constant{}); + } + } + + timer.End(); + + float ave_time = timer.GetElapsedTime() / nrepeat; + + float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc, + wei_k_c_y_x_global_desc, + out_n_k0_ho_wo_k1_global_desc) / + (std::size_t(1000) * 1000 * 1000) / ave_time; + + std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" + << std::endl; + } + } +}; +#endif diff --git a/host/driver_offline/include/driver_dynamic_gemm_dlops_v1r2.hpp b/host/driver_offline/include/driver_dynamic_gemm_dlops_v1r2.hpp new file mode 100644 index 0000000000..0ebc68b48a --- /dev/null +++ b/host/driver_offline/include/driver_dynamic_gemm_dlops_v1r2.hpp @@ -0,0 +1,415 @@ +#ifndef DRIVER_DYNAMIC_GEMM_DLOPS_V1R2 +#define DRIVER_DYNAMIC_GEMM_DLOPS_V1R2 + +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "gridwise_dynamic_gemm_dlops_v1r2.hpp" + +template +__host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid, + const FloatAB* p_b_grid, + FloatC* p_c_grid, + const AKMGridDesc& a_k_m_grid_desc, + const BKNGridDesc& b_k_n_grid_desc, + const CMNGridDesc& c_m_n_grid_desc, + AGridIteratorHacks, + BGridIteratorHacks, + CGridIteratorHacks, + AGridMoveSliceWindowIteratorHacks, + BGridMoveSliceWindowIteratorHacks, + ck::index_t nrepeat) + +{ + using namespace ck; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + constexpr auto I5 = Number<5>{}; + + // GEMM + using GridwiseGemm = + GridwiseDynamicGemmDlops_km_kn_mn_v1r2; + 
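+    // [Editorial sketch, not part of the original commit] The four kernel
+    // instantiations below are selected by two booleans derived from the reduction
+    // extent K. The authoritative formulas live in
+    // GridwiseGemm::CalculateHasMainKBlockLoop / CalculateHasDoubleTailKBlockLoop;
+    // the v5r1 convolution drivers in this patch spell the same idea out inline,
+    // roughly (assuming KPerBlock-sized K slices consumed two at a time by the
+    // double-buffered main loop):
+    //
+    //     const bool has_main_k_block_loop        = (K + KPerBlock) / (2 * KPerBlock) > 1;
+    //     const bool has_double_tail_k_block_loop = (K / KPerBlock) % 2 == 0;
+    //
+    //     e.g. K = 64, KPerBlock = 8  ->  8 slices: main loop runs, double tail
+    //          K = 8,  KPerBlock = 8  ->  1 slice : no main loop, single tail
+    //
+    // The lines above are only an illustration of why both a "main loop" and a
+    // "double tail" flavor of the kernel exist; the code below uses the gridwise
+    // helper functions rather than this inline form.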
+ const auto M = a_k_m_grid_desc.GetLength(I1); + const auto N = b_k_n_grid_desc.GetLength(I1); + const auto K = a_k_m_grid_desc.GetLength(I0); + + if(!GridwiseGemm::CheckValidity(a_k_m_grid_desc, b_k_n_grid_desc, c_m_n_grid_desc)) + { + throw std::runtime_error( + "wrong! GridwiseDynamicGemmDlops_km_kn_mn_v1r2 has invalid setting"); + } + + const auto a_k_m0_m1_grid_desc = GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc); + const auto b_k_n0_n1_grid_desc = GridwiseGemm::MakeBKN0N1GridDescriptor(b_k_n_grid_desc); + + using AKM0M1GridDesc = decltype(a_k_m0_m1_grid_desc); + using BKN0N1GridDesc = decltype(b_k_n0_n1_grid_desc); + + // c_m0_m10_m11_n0_n10_n11_grid_desc + const auto c_m0_m10_m11_n0_n10_n11_grid_desc = + GridwiseGemm::MakeCM0M10M11N0N10N11GridDescriptor(c_m_n_grid_desc); + + using CM0M10M11N0N10N11GridDesc = decltype(c_m0_m10_m11_n0_n10_n11_grid_desc); + + // c_blockid_to_m0_n0_block_cluster_adaptor + const auto c_blockid_to_m0_n0_block_cluster_adaptor = + GridwiseGemm::MakeCBlockIdToM0N0BlockClusterAdaptor(c_m_n_grid_desc); + + using CBlockIdToM0N0BlockClusterAdaptor = decltype(c_blockid_to_m0_n0_block_cluster_adaptor); + + const index_t grid_size = GridwiseGemm::CalculateGridSize(M, N); + + const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K); + + const bool has_double_tail_k_block_loop = GridwiseGemm::CalculateHasDoubleTailKBlockLoop(K); + + { + std::cout << "a_k_m0_m1_grid_desc{" << a_k_m0_m1_grid_desc.GetLength(I0) << ", " + << a_k_m0_m1_grid_desc.GetLength(I1) << ", " << a_k_m0_m1_grid_desc.GetLength(I2) + << "}" << std::endl; + + std::cout << "b_k_n0_n1_grid_desc{" << b_k_n0_n1_grid_desc.GetLength(I0) << ", " + << b_k_n0_n1_grid_desc.GetLength(I1) << ", " << b_k_n0_n1_grid_desc.GetLength(I2) + << "}" << std::endl; + + std::cout << "c_m0_m10_m11_n0_n10_n11_grid_desc{ " + << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I0) << ", " + << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I1) << ", " + << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I2) << ", " + << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I3) << ", " + << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I4) << ", " + << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I5) << "}" << std::endl; + } + +#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE + float ave_time = 0; + + if(has_main_k_block_loop && has_double_tail_k_block_loop) + { + const auto kernel = + kernel_dynamic_gemm_dlops_v1r2, + remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + true>; + + ave_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + 0, + p_a_grid, + p_b_grid, + p_c_grid, + a_k_m0_m1_grid_desc, + b_k_n0_n1_grid_desc, + c_m0_m10_m11_n0_n10_n11_grid_desc, + c_blockid_to_m0_n0_block_cluster_adaptor); + } + else if(has_main_k_block_loop && !has_double_tail_k_block_loop) + { + const auto kernel = + kernel_dynamic_gemm_dlops_v1r2, + remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + false>; + + ave_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + 0, + p_a_grid, + p_b_grid, + p_c_grid, + a_k_m0_m1_grid_desc, + b_k_n0_n1_grid_desc, + c_m0_m10_m11_n0_n10_n11_grid_desc, + c_blockid_to_m0_n0_block_cluster_adaptor); + } + else if(!has_main_k_block_loop && has_double_tail_k_block_loop) + { + const auto kernel = + kernel_dynamic_gemm_dlops_v1r2, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + true>; + + ave_time = launch_and_time_kernel(kernel, + nrepeat, + 
dim3(grid_size), + dim3(BlockSize), + 0, + 0, + p_a_grid, + p_b_grid, + p_c_grid, + a_k_m0_m1_grid_desc, + b_k_n0_n1_grid_desc, + c_m0_m10_m11_n0_n10_n11_grid_desc, + c_blockid_to_m0_n0_block_cluster_adaptor); + } + else + { + const auto kernel = + kernel_dynamic_gemm_dlops_v1r2, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + false>; + + ave_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + 0, + p_a_grid, + p_b_grid, + p_c_grid, + a_k_m0_m1_grid_desc, + b_k_n0_n1_grid_desc, + c_m0_m10_m11_n0_n10_n11_grid_desc, + c_blockid_to_m0_n0_block_cluster_adaptor); + } + + return ave_time; +#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER + DeviceMem a_k_m0_m1_grid_desc_dev_buf(sizeof(AKM0M1GridDesc)); + DeviceMem b_k_n0_n1_grid_desc_dev_buf(sizeof(BKN0N1GridDesc)); + DeviceMem c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf(sizeof(CM0M10M11N0N10N11GridDesc)); + DeviceMem c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf( + sizeof(CBlockIdToM0N0BlockClusterAdaptor)); + + a_k_m0_m1_grid_desc_dev_buf.ToDevice(&a_k_m0_m1_grid_desc); + b_k_n0_n1_grid_desc_dev_buf.ToDevice(&b_k_n0_n1_grid_desc); + c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.ToDevice(&c_m0_m10_m11_n0_n10_n11_grid_desc); + c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.ToDevice( + &c_blockid_to_m0_n0_block_cluster_adaptor); + + float ave_time = 0; + + if(has_main_k_block_loop && has_double_tail_k_block_loop) + { + const auto kernel = + kernel_dynamic_gemm_dlops_v1r2, + remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + true>; + + ave_time = launch_and_time_kernel( + kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + 0, + p_a_grid, + p_b_grid, + p_c_grid, + (void CONSTANT*)a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()); + } + else if(has_main_k_block_loop && !has_double_tail_k_block_loop) + { + const auto kernel = + kernel_dynamic_gemm_dlops_v1r2, + remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + false>; + + ave_time = launch_and_time_kernel( + kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + 0, + p_a_grid, + p_b_grid, + p_c_grid, + (void CONSTANT*)a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()); + } + else if(!has_main_k_block_loop && has_double_tail_k_block_loop) + { + const auto kernel = + kernel_dynamic_gemm_dlops_v1r2, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + true>; + + ave_time = launch_and_time_kernel( + kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + 0, + p_a_grid, + p_b_grid, + p_c_grid, + (void CONSTANT*)a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()); + } + else + { + const auto kernel = + kernel_dynamic_gemm_dlops_v1r2, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + false>; + + ave_time = launch_and_time_kernel( + kernel, 
+ nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + 0, + p_a_grid, + p_b_grid, + p_c_grid, + (void CONSTANT*)a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()); + } + + return ave_time; +#endif +} +#endif diff --git a/host/driver_offline/include/driver_dynamic_gemm_dlops_v1r3.hpp b/host/driver_offline/include/driver_dynamic_gemm_dlops_v1r3.hpp new file mode 100644 index 0000000000..d075eac822 --- /dev/null +++ b/host/driver_offline/include/driver_dynamic_gemm_dlops_v1r3.hpp @@ -0,0 +1,411 @@ +#ifndef DRIVER_DYNAMIC_GEMM_DLOPS_V1R3 +#define DRIVER_DYNAMIC_GEMM_DLOPS_V1R3 + +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "gridwise_dynamic_gemm_dlops_v1r3.hpp" + +template +__host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid, + const FloatAB* p_b_grid, + FloatC* p_c_grid, + const AK0MK1GridDesc& a_k0_m_k1_grid_desc, + const BK0NK1GridDesc& b_k0_n_k1_grid_desc, + const CMNGridDesc& c_m_n_grid_desc, + AGridIteratorHacks, + BGridIteratorHacks, + CGridIteratorHacks, + AGridMoveSliceWindowIteratorHacks, + BGridMoveSliceWindowIteratorHacks, + ck::index_t nrepeat) + +{ + using namespace ck; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + constexpr auto I5 = Number<5>{}; + + // GEMM + using GridwiseGemm = + GridwiseDynamicGemmDlops_km_kn_mn_v1r3; + + const auto M = a_k0_m_k1_grid_desc.GetLength(I1); + const auto N = b_k0_n_k1_grid_desc.GetLength(I1); + const auto K0 = a_k0_m_k1_grid_desc.GetLength(I0); + + if(!GridwiseGemm::CheckValidity(a_k0_m_k1_grid_desc, b_k0_n_k1_grid_desc, c_m_n_grid_desc)) + { + throw std::runtime_error( + "wrong! 
GridwiseDynamicGemmDlops_km_kn_mn_v1r3 has invalid setting"); + } + + const auto a_k0_m0_m1_k1_grid_desc = + GridwiseGemm::MakeAK0M0M1K1GridDescriptor(a_k0_m_k1_grid_desc); + const auto b_k0_n0_n1_k1_grid_desc = + GridwiseGemm::MakeBK0N0N1K1GridDescriptor(b_k0_n_k1_grid_desc); + + using AK0M0M1K1GridDesc = decltype(a_k0_m0_m1_k1_grid_desc); + using BK0N0N1K1GridDesc = decltype(b_k0_n0_n1_k1_grid_desc); + + // c_m0_m10_m11_n0_n10_n11_grid_desc + const auto c_m0_m10_m11_n0_n10_n11_grid_desc = + GridwiseGemm::MakeCM0M10M11N0N10N11GridDescriptor(c_m_n_grid_desc); + + using CM0M10M11N0N10N11GridDesc = decltype(c_m0_m10_m11_n0_n10_n11_grid_desc); + + // c_blockid_to_m0_n0_block_cluster_adaptor + const auto c_blockid_to_m0_n0_block_cluster_adaptor = + GridwiseGemm::MakeCBlockIdToM0N0BlockClusterAdaptor(c_m_n_grid_desc); + + using CBlockIdToM0N0BlockClusterAdaptor = decltype(c_blockid_to_m0_n0_block_cluster_adaptor); + + const index_t grid_size = GridwiseGemm::CalculateGridSize(M, N); + + const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K0); + + const bool has_double_tail_k_block_loop = GridwiseGemm::CalculateHasDoubleTailKBlockLoop(K0); + + { + std::cout << "a_k0_m0_m1_k1_grid_desc{" << a_k0_m0_m1_k1_grid_desc.GetLength(I0) << ", " + << a_k0_m0_m1_k1_grid_desc.GetLength(I1) << ", " + << a_k0_m0_m1_k1_grid_desc.GetLength(I2) << ", " + << a_k0_m0_m1_k1_grid_desc.GetLength(I3) << "}" << std::endl; + + std::cout << "b_k0_n0_n1_k1_grid_desc{" << b_k0_n0_n1_k1_grid_desc.GetLength(I0) << ", " + << b_k0_n0_n1_k1_grid_desc.GetLength(I1) << ", " + << b_k0_n0_n1_k1_grid_desc.GetLength(I2) << ", " + << b_k0_n0_n1_k1_grid_desc.GetLength(I3) << "}" << std::endl; + + std::cout << "c_m0_m10_m11_n0_n10_n11_grid_desc{ " + << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I0) << ", " + << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I1) << ", " + << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I2) << ", " + << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I3) << ", " + << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I4) << ", " + << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I5) << "}" << std::endl; + } + +#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE + float ave_time = 0; + + if(has_main_k_block_loop && has_double_tail_k_block_loop) + { + const auto kernel = + kernel_dynamic_gemm_dlops_v1r3, + remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + true>; + + ave_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + 0, + p_a_grid, + p_b_grid, + p_c_grid, + a_k0_m0_m1_k1_grid_desc, + b_k0_n0_n1_k1_grid_desc, + c_m0_m10_m11_n0_n10_n11_grid_desc, + c_blockid_to_m0_n0_block_cluster_adaptor); + } + else if(has_main_k_block_loop && !has_double_tail_k_block_loop) + { + const auto kernel = + kernel_dynamic_gemm_dlops_v1r3, + remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + false>; + + ave_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + 0, + p_a_grid, + p_b_grid, + p_c_grid, + a_k0_m0_m1_k1_grid_desc, + b_k0_n0_n1_k1_grid_desc, + c_m0_m10_m11_n0_n10_n11_grid_desc, + c_blockid_to_m0_n0_block_cluster_adaptor); + } + else if(!has_main_k_block_loop && has_double_tail_k_block_loop) + { + const auto kernel = + kernel_dynamic_gemm_dlops_v1r3, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + true>; + + ave_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + 0, + p_a_grid, + p_b_grid, + p_c_grid, + 
a_k0_m0_m1_k1_grid_desc, + b_k0_n0_n1_k1_grid_desc, + c_m0_m10_m11_n0_n10_n11_grid_desc, + c_blockid_to_m0_n0_block_cluster_adaptor); + } + else + { + const auto kernel = + kernel_dynamic_gemm_dlops_v1r3, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + false>; + + ave_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + 0, + p_a_grid, + p_b_grid, + p_c_grid, + a_k0_m0_m1_k1_grid_desc, + b_k0_n0_n1_k1_grid_desc, + c_m0_m10_m11_n0_n10_n11_grid_desc, + c_blockid_to_m0_n0_block_cluster_adaptor); + } + + return ave_time; +#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER + DeviceMem a_k0_m0_m1_k1_grid_desc_dev_buf(sizeof(AK0M0M1K1GridDesc)); + DeviceMem b_k0_n0_n1_k1_grid_desc_dev_buf(sizeof(BK0N0N1K1GridDesc)); + DeviceMem c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf(sizeof(CM0M10M11N0N10N11GridDesc)); + DeviceMem c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf( + sizeof(CBlockIdToM0N0BlockClusterAdaptor)); + + a_k0_m0_m1_k1_grid_desc_dev_buf.ToDevice(&a_k0_m0_m1_k1_grid_desc); + b_k0_n0_n1_k1_grid_desc_dev_buf.ToDevice(&b_k0_n0_n1_k1_grid_desc); + c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.ToDevice(&c_m0_m10_m11_n0_n10_n11_grid_desc); + c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.ToDevice( + &c_blockid_to_m0_n0_block_cluster_adaptor); + + float ave_time = 0; + + if(has_main_k_block_loop && has_double_tail_k_block_loop) + { + const auto kernel = + kernel_dynamic_gemm_dlops_v1r3, + remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + true>; + + ave_time = launch_and_time_kernel( + kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + 0, + p_a_grid, + p_b_grid, + p_c_grid, + (void CONSTANT*)a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()); + } + else if(has_main_k_block_loop && !has_double_tail_k_block_loop) + { + const auto kernel = + kernel_dynamic_gemm_dlops_v1r3, + remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + false>; + + ave_time = launch_and_time_kernel( + kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + 0, + p_a_grid, + p_b_grid, + p_c_grid, + (void CONSTANT*)a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()); + } + else if(!has_main_k_block_loop && has_double_tail_k_block_loop) + { + const auto kernel = + kernel_dynamic_gemm_dlops_v1r3, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + true>; + + ave_time = launch_and_time_kernel( + kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + 0, + p_a_grid, + p_b_grid, + p_c_grid, + (void CONSTANT*)a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()); + } + else + { + const auto kernel = + kernel_dynamic_gemm_dlops_v1r3, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + false>; + + ave_time = launch_and_time_kernel( + kernel, + nrepeat, + 
dim3(grid_size), + dim3(BlockSize), + 0, + 0, + p_a_grid, + p_b_grid, + p_c_grid, + (void CONSTANT*)a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()); + } + + return ave_time; +#endif +} +#endif diff --git a/host/driver_offline/include/driver_dynamic_gemm_xdlops_v2r3.hpp b/host/driver_offline/include/driver_dynamic_gemm_xdlops_v2r3.hpp new file mode 100644 index 0000000000..481d08188d --- /dev/null +++ b/host/driver_offline/include/driver_dynamic_gemm_xdlops_v2r3.hpp @@ -0,0 +1,196 @@ +#ifndef DRIVER_DYNAMIC_GEMM_XDLOPS_V2R3 +#define DRIVER_DYNAMIC_GEMM_XDLOPS_V2R3 + +#include "common_header.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "gridwise_dynamic_gemm_xdlops_v2r3.hpp" + +template +__host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid, + const FloatAB* p_b_grid, + FloatC* p_c_grid, + const AK0MK1GridDesc& a_k0_m_k1_grid_desc, + const BK0NK1GridDesc& b_k0_n_k1_grid_desc, + const CMNGridDesc& c_m_n_grid_desc, + AGridIteratorHacks, + BGridIteratorHacks, + CGridIteratorHacks, + AGridMoveSliceWindowIteratorHacks, + BGridMoveSliceWindowIteratorHacks, + ck::index_t nrepeat) + +{ + using namespace ck; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + constexpr auto I5 = Number<5>{}; + + using GridwiseGemm = + GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3; + + { + std::cout << "a_k0_m_k1_grid_desc{" << a_k0_m_k1_grid_desc.GetLength(I0) << ", " + << a_k0_m_k1_grid_desc.GetLength(I1) << ", " << a_k0_m_k1_grid_desc.GetLength(I2) + << "}" << std::endl; + + std::cout << "b_k0_n_k1_grid_desc{" << b_k0_n_k1_grid_desc.GetLength(I0) << ", " + << b_k0_n_k1_grid_desc.GetLength(I1) << ", " << b_k0_n_k1_grid_desc.GetLength(I2) + << "}" << std::endl; + + std::cout << "c_m_n_grid_desc{ " << c_m_n_grid_desc.GetLength(I0) << ", " + << c_m_n_grid_desc.GetLength(I1) << "}" << std::endl; + } + + if(!GridwiseGemm::CheckValidity(a_k0_m_k1_grid_desc, b_k0_n_k1_grid_desc, c_m_n_grid_desc)) + { + throw std::runtime_error( + "wrong! 
GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v2r3 has invalid setting"); + } + + const auto c_m0_m1_m2_n_grid_desc = GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc); + + using CM0M1M2NGridDesc = decltype(c_m0_m1_m2_n_grid_desc); + + const auto c_block_cluster_adaptor = GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc); + + using CBlockClusterAdaptor = decltype(c_block_cluster_adaptor); + + const index_t grid_size = GridwiseGemm::CalculateGridSize(c_m_n_grid_desc); + + const auto kernel = kernel_dynamic_gemm_xdlops_v2r3, + remove_reference_t, + remove_reference_t, + remove_reference_t>; + +#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE + float ave_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + 0, + p_a_grid, + p_b_grid, + p_c_grid, + a_k0_m_k1_grid_desc, + b_k0_n_k1_grid_desc, + c_m0_m1_m2_n_grid_desc, + c_block_cluster_adaptor); + +#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER + DeviceMem a_k0_m_k1_grid_desc_dev_buf(sizeof(AK0MK1GridDesc)); + DeviceMem b_k0_n_k1_grid_desc_dev_buf(sizeof(BK0NK1GridDesc)); + DeviceMem c_m0_m1_m2_n_grid_desc_dev_buf(sizeof(CM0M1M2NGridDesc)); + DeviceMem c_block_cluster_adaptor_dev_buf(sizeof(CBlockClusterAdaptor)); + + a_k0_m_k1_grid_desc_dev_buf.ToDevice(&a_k0_m_k1_grid_desc); + b_k0_n_k1_grid_desc_dev_buf.ToDevice(&b_k0_n_k1_grid_desc); + c_m0_m1_m2_n_grid_desc_dev_buf.ToDevice(&c_m0_m1_m2_n_grid_desc); + c_block_cluster_adaptor_dev_buf.ToDevice(&c_block_cluster_adaptor); + + float ave_time = + launch_and_time_kernel(kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + 0, + p_a_grid, + p_b_grid, + p_c_grid, + (void CONSTANT*)a_k0_m_k1_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)b_k0_n_k1_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)c_m0_m1_m2_n_grid_desc_dev_buf.GetDeviceBuffer(), + (void CONSTANT*)c_block_cluster_adaptor_dev_buf.GetDeviceBuffer()); +#endif + return ave_time; +} +#endif diff --git a/host/driver_online/CMakeLists.txt b/host/driver_online/CMakeLists.txt new file mode 100644 index 0000000000..2ae05e0ba5 --- /dev/null +++ b/host/driver_online/CMakeLists.txt @@ -0,0 +1,21 @@ +include_directories(BEFORE + include + ${PROJECT_BINARY_DIR}/host/online_compilation/include + ${PROJECT_SOURCE_DIR}/host/online_compilation/include + ${PROJECT_SOURCE_DIR}/host/host_tensor/include + ${PROJECT_SOURCE_DIR}/composable_kernel/include + ${PROJECT_SOURCE_DIR}/composable_kernel/include/utility + ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description + ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation + ${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform + ${PROJECT_SOURCE_DIR}/composable_kernel/include/driver + ${PROJECT_SOURCE_DIR}/external/rocm/include + ${PROJECT_SOURCE_DIR}/external/half/include +) + +set(CONV_FWD_DRIVER_ONLINE_SOURCE conv_fwd_driver_online.cpp) + +add_executable(conv_fwd_driver_online ${CONV_FWD_DRIVER_ONLINE_SOURCE}) + +target_link_libraries(conv_fwd_driver_online PRIVATE host_tensor) +target_link_libraries(conv_fwd_driver_online PRIVATE online_compilation) diff --git a/host/driver_online/conv_fwd_driver_online.cpp b/host/driver_online/conv_fwd_driver_online.cpp new file mode 100644 index 0000000000..c91f76fa24 --- /dev/null +++ b/host/driver_online/conv_fwd_driver_online.cpp @@ -0,0 +1,453 @@ +#include +#include +#include +#include +#include +#include +#include "config.hpp" +#include "print.hpp" +#include "device.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" 
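+// [Editorial usage sketch, not part of the original commit] The argument parsing
+// further below expects argc == 22: six control arguments (layout, algo,
+// do_verification, init_method, do_log, nrepeat) followed by the fifteen
+// convolution parameters (N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx,
+// RightPy, RightPx). A hypothetical invocation (all values illustrative only,
+// layout/algo indices as defined by ConvTensorLayout / ConvForwardAlgo):
+//
+//     ./conv_fwd_driver_online 0 1 1 4 0 100  128 256 192 3 3 71 71  2 2 1 1 1 1 1 1
+//
+// With these numbers the driver computes YEff = (Y - 1) * Dy + 1 = 3 and
+// Ho = (Hi + LeftPy + RightPy - YEff) / Sy + 1 = (71 + 1 + 1 - 3) / 2 + 1 = 36
+// (and likewise Wo = 36), matching the formulas used right after argument parsing.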
+#include "conv_common.hpp" +#include "host_conv.hpp" +#include "device_tensor.hpp" +#include "handle.hpp" +#include "hipCheck.hpp" +#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp" +#include "online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp" +#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp" +#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp" + +#define USE_CONV_FWD_V4R4_NCHW 1 +#define USE_CONV_FWD_V6R1_NCHW 1 +#define USE_CONV_FWD_V4R4_XDLOPS_NCHW 1 +#define USE_CONV_FWD_V4R4_XDLOPS_NHWC 1 + +enum ConvForwardAlgo +{ + V4R4NCHW, // 0 + V6R1NCHW, // 1 + V4R4XDLNCHW, // 2 + V4R4XDLNHWC // 3 +}; + +int main(int argc, char* argv[]) +{ + using namespace ck; + using namespace ck_driver; + using size_t = std::size_t; + + hipStream_t stream; + olCompile::Handle* handle; + + MY_HIP_CHECK(hipStreamCreate(&stream)); + + handle = new olCompile::Handle(stream); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + constexpr auto I5 = Number<5>{}; + constexpr auto I6 = Number<6>{}; + + if(argc != 22) + { + printf("arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat\n"); + printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n"); + exit(1); + } + + const ConvTensorLayout layout = static_cast(atoi(argv[1])); + const ConvForwardAlgo algo = static_cast(atoi(argv[2])); + const bool do_verification = atoi(argv[3]); + const int init_method = atoi(argv[4]); + const bool do_log = atoi(argv[5]); + const int nrepeat = atoi(argv[6]); + + const index_t N = atoi(argv[7]); + const index_t K = atoi(argv[8]); + const index_t C = atoi(argv[9]); + const index_t Y = atoi(argv[10]); + const index_t X = atoi(argv[11]); + const index_t Hi = atoi(argv[12]); + const index_t Wi = atoi(argv[13]); + + const index_t conv_stride_h = atoi(argv[14]); + const index_t conv_stride_w = atoi(argv[15]); + const index_t conv_dilation_h = atoi(argv[16]); + const index_t conv_dilation_w = atoi(argv[17]); + const index_t in_left_pad_h = atoi(argv[18]); + const index_t in_left_pad_w = atoi(argv[19]); + const index_t in_right_pad_h = atoi(argv[20]); + const index_t in_right_pad_w = atoi(argv[21]); + + const index_t YEff = (Y - 1) * conv_dilation_h + 1; + const index_t XEff = (X - 1) * conv_dilation_w + 1; + + const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1; + const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; + +#if 1 + using in_data_t = float; + using acc_data_t = float; + using out_data_t = float; +#elif 0 + using in_data_t = half_t; + using acc_data_t = float; + using out_data_t = half_t; +#elif 1 + using in_data_t = int8_t; + using acc_data_t = int32_t; + using out_data_t = int8_t; +#endif + + std::vector in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4); + + switch(layout) + { + case ConvTensorLayout::NCHW: + // NCHW + in_lengths_host[0] = static_cast(N); + in_lengths_host[1] = static_cast(C); + in_lengths_host[2] = static_cast(Hi); + in_lengths_host[3] = static_cast(Wi); + + wei_lengths_host[0] = static_cast(K); + wei_lengths_host[1] = static_cast(C); + wei_lengths_host[2] = static_cast(Y); + wei_lengths_host[3] = static_cast(X); + + out_lengths_host[0] = static_cast(N); + out_lengths_host[1] = 
static_cast(K); + out_lengths_host[2] = static_cast(Ho); + out_lengths_host[3] = static_cast(Wo); + break; + case ConvTensorLayout::NHWC: + // NHWC + in_lengths_host[0] = static_cast(N); + in_lengths_host[1] = static_cast(Hi); + in_lengths_host[2] = static_cast(Wi); + in_lengths_host[3] = static_cast(C); + + wei_lengths_host[0] = static_cast(K); + wei_lengths_host[1] = static_cast(Y); + wei_lengths_host[2] = static_cast(X); + wei_lengths_host[3] = static_cast(C); + + out_lengths_host[0] = static_cast(N); + out_lengths_host[1] = static_cast(Ho); + out_lengths_host[2] = static_cast(Wo); + out_lengths_host[3] = static_cast(K); + break; + default: throw std::runtime_error("wrong! not implemented"); + } + + Tensor in(in_lengths_host); + Tensor wei(wei_lengths_host); + Tensor out_host(out_lengths_host); + Tensor out_device(out_lengths_host); + + std::cout << "layout: " << layout << std::endl; + ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: "); + ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: "); + ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: "); + print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w)); + print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w)); + print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w)); + print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w)); + + std::size_t num_thread = std::thread::hardware_concurrency(); + + switch(init_method) + { + case 0: + // no initialization + break; + case 1: + in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + break; + case 2: + in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + case 3: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + break; + case 4: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + case 5: + in.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}, num_thread); + break; + default: + in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); + + auto gen_wei = [](auto... is) { + return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...); + }; + wei.GenerateTensorValue(gen_wei, num_thread); + } + + auto f_make_for_device_nchw = [&]() { + const auto in_lengths_dev = make_tuple(N, C, Hi, Wi); + const auto wei_lengths_dev = make_tuple(K, C, Y, X); + const auto out_lengths_dev = make_tuple(N, K, Ho, Wo); + + return make_tuple(in_lengths_dev, wei_lengths_dev, out_lengths_dev); + }; + + auto f_make_for_device_nhwc = [&]() { + const auto in_lengths_dev = make_tuple(N, Hi, Wi, C); + const auto wei_lengths_dev = make_tuple(K, Y, X, C); + const auto out_lengths_dev = make_tuple(N, Ho, Wo, K); + + return make_tuple(in_lengths_dev, wei_lengths_dev, out_lengths_dev); + }; + + const auto conv_strides = make_tuple(conv_stride_h, conv_stride_w); + const auto conv_dilations = make_tuple(conv_dilation_h, conv_dilation_w); + const auto in_left_pads = make_tuple(in_left_pad_h, in_left_pad_w); + const auto in_right_pads = make_tuple(in_right_pad_h, in_right_pad_w); + +#if USE_CONV_FWD_V4R4_NCHW + if(algo == ConvForwardAlgo::V4R4NCHW) + { + if(layout != ConvTensorLayout::NCHW) + { + throw std::runtime_error("wrong! 
layout"); + } + + const auto tmp = f_make_for_device_nchw(); + + tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* tunable = + &default_tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw; + + online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw< + in_data_t, + acc_data_t, + out_data_t>(handle, + tmp[I0], + tmp[I1], + tmp[I2], + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads, + in, + wei, + out_device, + tunable, + nrepeat); + } +#endif + +#if USE_CONV_FWD_V6R1_NCHW + if(algo == ConvForwardAlgo::V6R1NCHW) + { + if(layout != ConvTensorLayout::NCHW) + { + throw std::runtime_error("wrong! layout"); + } + + const auto tmp = f_make_for_device_nchw(); + +#if 1 + const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param = { + get_datatype_enum_from_type::value, + get_datatype_enum_from_type::value, + get_datatype_enum_from_type::value, + 256, + 4, + 1, + 128, + 32, + 8, + 4, + 4, + 1, + {8, 2}, + {8, 2}, + {4, 1, 1, 1, 1}, + {2, 1, 1, 128, 1}, + {4, 1, 1, 1, 1}, + {1, 1, 1, 1, 1}, + {1, 4, 1, 1, 1}, + {8, 1, 1, 32, 1}, + {1, 1, 1, 1, 1}, + {1, 1, 1, 1, 1}, + 4, + true, + true}; +#elif 0 + const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param = { + get_datatype_enum_from_type::value, + get_datatype_enum_from_type::value, + get_datatype_enum_from_type::value, + 256, + 4, + 2, + 128, + 32, + 8, + 4, + 4, + 1, + {8, 2}, + {8, 2}, + {4, 1, 1, 1, 2}, + {2, 1, 1, 128, 1}, + {4, 1, 1, 1, 1}, + {1, 1, 1, 1, 1}, + {1, 4, 1, 1, 2}, + {8, 1, 1, 32, 1}, + {1, 1, 1, 1, 1}, + {1, 1, 1, 1, 1}, + 4, + true, + true}; +#elif 1 + const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param = { + get_datatype_enum_from_type::value, + get_datatype_enum_from_type::value, + get_datatype_enum_from_type::value, + 256, + 4, + 4, + 128, + 32, + 8, + 4, + 4, + 1, + {8, 2}, + {8, 2}, + {4, 1, 1, 1, 4}, + {2, 1, 1, 128, 1}, + {4, 1, 1, 1, 1}, + {1, 1, 1, 1, 1}, + {1, 4, 1, 1, 4}, + {8, 1, 1, 32, 1}, + {1, 1, 1, 1, 1}, + {1, 1, 1, 1, 1}, + 4, + true, + true}; +#endif + + online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw< + in_data_t, + acc_data_t, + out_data_t>(handle, + tmp[I0], + tmp[I1], + tmp[I2], + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads, + in, + wei, + out_device, + compile_param, + nrepeat); + } +#endif + +#if USE_CONV_FWD_V4R4_XDLOPS_NCHW + if(algo == ConvForwardAlgo::V4R4XDLNCHW) + { + if(layout != ConvTensorLayout::NCHW) + { + throw std::runtime_error("wrong! layout"); + } + + const auto tmp = f_make_for_device_nchw(); + + tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* tunable = + &default_tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw; + + online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw< + in_data_t, + acc_data_t, + out_data_t>(handle, + tmp[I0], + tmp[I1], + tmp[I2], + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads, + in, + wei, + out_device, + tunable, + nrepeat); + } +#endif + +#if USE_CONV_FWD_V4R4_XDLOPS_NHWC + if(algo == ConvForwardAlgo::V4R4XDLNHWC) + { + if(layout != ConvTensorLayout::NHWC) + { + throw std::runtime_error("wrong! 
layout"); + } + + const auto tmp = f_make_for_device_nhwc(); + + tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* tunable = + &default_tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk; + + online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk< + in_data_t, + acc_data_t, + out_data_t>(handle, + tmp[I0], + tmp[I1], + tmp[I2], + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads, + in, + wei, + out_device, + tunable, + nrepeat); + } +#endif + + if(do_verification) + { + host_direct_convolution(in, + wei, + out_host, + make_tuple(conv_stride_h, conv_stride_w), + make_tuple(conv_dilation_h, conv_dilation_w), + make_tuple(in_left_pad_h, in_left_pad_w), + make_tuple(in_right_pad_h, in_right_pad_w), + layout); + + check_error(out_host, out_device); + +#if 0 + if(do_log) + { + LogRangeAsType(std::cout << "in : ", in.mData, ",") << std::endl; + LogRangeAsType(std::cout << "wei: ", wei.mData, ",") << std::endl; + LogRangeAsType(std::cout << "out_host : ", out_host.mData, ",") << std::endl; + LogRangeAsType(std::cout << "out_device: ", out_device.mData, ",") << std::endl; + } +#endif + } + + delete handle; + MY_HIP_CHECK(hipStreamDestroy(stream)); +} diff --git a/host/driver_online/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp b/host/driver_online/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp new file mode 100644 index 0000000000..b0c4921019 --- /dev/null +++ b/host/driver_online/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp @@ -0,0 +1,673 @@ +#ifndef CONV_IGEMM_FWD_V6R1_DLOPS_NCHW_KCYX_NKHW_HPP +#define CONV_IGEMM_FWD_V6R1_DLOPS_NCHW_KCYX_NKHW_HPP + +#include + +namespace ck_driver { + +struct CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw +{ + ck::DataTypeEnum_t ABDataTypeEnum; + ck::DataTypeEnum_t AccDataTypeEnum; + ck::DataTypeEnum_t CDataTypeEnum; + + int BlockSize; + + int GN0; + int GK1; + + int GM1PerBlockGM11; + int GN1PerBlockGN11; + int GK0PerBlock; + + int BM1PerThreadBM11; + int BN1PerThreadBN11; + int BK0PerThread; + + std::array BM10BN10ThreadClusterBM10Xs; + std::array BM10BN10ThreadClusterBN10Xs; + + std::array ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1; + std::array ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1; + std::array ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1; + std::array ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1; + + std::array BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1; + std::array BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1; + std::array BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1; + std::array BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1; + + int CThreadTransferDstScalarPerVector; + + bool HasMainKBlockLoop; + bool HasDoubleTailKBlockLoop; + + auto GetCompileParameterString() const + { + // clang-format off + return + " -DCK_PARAM_ABDataTypeEnum=" + + std::to_string(ABDataTypeEnum) + + " -DCK_PARAM_AccDataTypeEnum=" + + std::to_string(AccDataTypeEnum) + + " -DCK_PARAM_CDataTypeEnum=" + + std::to_string(CDataTypeEnum) + + " -DCK_PARAM_BlockSize=" + + std::to_string(BlockSize) + + " -DCK_PARAM_GN0=" + + std::to_string(GN0) + + " -DCK_PARAM_GK1=" + + std::to_string(GK1) + + " -DCK_PARAM_GM1PerBlockGM11=" + + std::to_string(GM1PerBlockGM11) + + " -DCK_PARAM_GN1PerBlockGN11=" + + std::to_string(GN1PerBlockGN11) + + " -DCK_PARAM_GK0PerBlock=" + + std::to_string(GK0PerBlock) + + " -DCK_PARAM_BM1PerThreadBM11=" + + std::to_string(BM1PerThreadBM11) + + " -DCK_PARAM_BN1PerThreadBN11=" + + 
std::to_string(BN1PerThreadBN11) + + " -DCK_PARAM_BK0PerThread=" + + std::to_string(BK0PerThread) + + " -DCK_PARAM_BM10BN10ThreadClusterBM10Xs=" + + std::to_string(BM10BN10ThreadClusterBM10Xs[0]) + "," + + std::to_string(BM10BN10ThreadClusterBM10Xs[1]) + + " -DCK_PARAM_BM10BN10ThreadClusterBN10Xs=" + + std::to_string(BM10BN10ThreadClusterBN10Xs[0]) + "," + + std::to_string(BM10BN10ThreadClusterBN10Xs[1]) + + " -DCK_PARAM_ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1=" + + std::to_string(ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[0]) + "," + + std::to_string(ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[1]) + "," + + std::to_string(ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[2]) + "," + + std::to_string(ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[3]) + "," + + std::to_string(ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[4]) + + " -DCK_PARAM_ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1=" + + std::to_string(ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[0]) + "," + + std::to_string(ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[1]) + "," + + std::to_string(ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[2]) + "," + + std::to_string(ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[3]) + "," + + std::to_string(ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[4]) + + " -DCK_PARAM_ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1=" + + std::to_string(ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0]) + "," + + std::to_string(ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1]) + "," + + std::to_string(ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2]) + "," + + std::to_string(ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3]) + "," + + std::to_string(ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4]) + + " -DCK_PARAM_ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1=" + + std::to_string(ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0]) + "," + + std::to_string(ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1]) + "," + + std::to_string(ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2]) + "," + + std::to_string(ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3]) + "," + + std::to_string(ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4]) + + " -DCK_PARAM_BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1=" + + std::to_string(BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[0]) + "," + + std::to_string(BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[1]) + "," + + std::to_string(BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[2]) + "," + + std::to_string(BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[3]) + "," + + std::to_string(BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[4]) + + " -DCK_PARAM_BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1=" + + std::to_string(BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[0]) + "," + + std::to_string(BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[1]) + "," + + std::to_string(BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[2]) + "," + + std::to_string(BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[3]) + "," + + std::to_string(BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[4]) + + " -DCK_PARAM_BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1=" + + 
std::to_string(BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0]) + "," + + std::to_string(BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1]) + "," + + std::to_string(BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2]) + "," + + std::to_string(BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3]) + "," + + std::to_string(BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4]) + + " -DCK_PARAM_BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1=" + + std::to_string(BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0]) + "," + + std::to_string(BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1]) + "," + + std::to_string(BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2]) + "," + + std::to_string(BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3]) + "," + + std::to_string(BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4]) + + " -DCK_PARAM_CThreadTransferDstScalarPerVector=" + + std::to_string(CThreadTransferDstScalarPerVector) + + " -DCK_PARAM_HasMainKBlockLoop=" + + std::to_string(HasMainKBlockLoop) + + " -DCK_PARAM_HasDoubleTailKBlockLoop=" + + std::to_string(HasDoubleTailKBlockLoop); + // clang-format on + } +}; + +struct TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw +{ + ck::DataTypeEnum_t ABDataTypeEnum; + ck::DataTypeEnum_t CDataTypeEnum; + + int BlockSize; + + int GN0; + int GK1; + + int GM1PerBlockGM11; + int GN1PerBlockGN11; + int GK0PerBlock; + + int BM1PerThreadBM11; + int BN1PerThreadBN11; + int BK0PerThread; + + std::array BM10BN10ThreadClusterBM10Xs; + std::array BM10BN10ThreadClusterBN10Xs; + + std::array ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1; + std::array ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1; + std::array ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1; + std::array ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1; + + std::array BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1; + std::array BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1; + std::array BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1; + std::array BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1; +}; + +inline static auto generate_tunable_list_conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw() +{ + constexpr auto f32 = ck::DataTypeEnum_t::Float; + constexpr auto f16 = ck::DataTypeEnum_t::Half; + constexpr auto i8 = ck::DataTypeEnum_t::Int8; + + return std::vector{ + // clang-format off + // fp32 + {f32, f32, 256, 1, 1, 128, 128, 16, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 2, 1}, {4, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 4, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}}, + + {f32, f32, 256, 1, 1, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}}, + {f32, f32, 256, 1, 1, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 2, 1}, {1, 1, 1, 4, 1}}, + {f32, f32, 256, 1, 1, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}}, + + {f32, f32, 256, 1, 1, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {4, 1, 1, 1, 1}, { 2, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}, + {f32, f32, 256, 2, 1, 
128, 64, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 2, 1, 1, 1}, { 4, 1, 1, 64, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}, + {f32, f32, 256, 4, 1, 128, 32, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 4, 1, 1, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}, + + {f32, f32, 256, 8, 1, 128, 16, 16, 4, 4, 1, {8, 2}, {8, 2}, {8, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 8, 1, 1, 1}, {16, 1, 1, 16, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}, + + {f32, f32, 128, 1, 1, 64, 128, 8, 4, 4, 1, {4, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {8, 1, 1, 1, 1}, { 1, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}, + + // fp16 + {f16, f16, 256, 1, 2, 128, 128, 16, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 2, 2}, {4, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 4, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}}, + + {f16, f16, 256, 1, 2, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}}, + {f16, f16, 256, 1, 2, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 2, 1}, {1, 1, 1, 4, 1}}, + {f16, f16, 256, 1, 2, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}}, + + {f16, f16, 256, 1, 2, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {4, 1, 1, 1, 2}, { 2, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}, + {f16, f16, 256, 2, 2, 128, 64, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 2, 1, 1, 2}, { 4, 1, 1, 64, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}, + {f16, f16, 256, 4, 2, 128, 32, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 4, 1, 1, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}, + + {f16, f16, 256, 8, 2, 128, 16, 16, 4, 4, 1, {8, 2}, {8, 2}, {8, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 8, 1, 1, 2}, {16, 1, 1, 16, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}, + + {f16, f16, 128, 1, 2, 64, 128, 8, 4, 4, 1, {4, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {8, 1, 1, 1, 2}, { 1, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}, + + // i8 + { i8, i8, 256, 1, 4, 128, 128, 16, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 2, 4}, {4, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 4, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}}, + + { i8, i8, 256, 1, 4, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}}, + { i8, i8, 256, 1, 4, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 2, 1}, {1, 1, 1, 4, 1}}, + { i8, i8, 256, 1, 4, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}}, + + { i8, i8, 256, 1, 4, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, 
{4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {4, 1, 1, 1, 4}, { 2, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}, + { i8, i8, 256, 2, 4, 128, 64, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 2, 1, 1, 4}, { 4, 1, 1, 64, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}, + { i8, i8, 256, 4, 4, 128, 32, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 4, 1, 1, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}, + + { i8, i8, 256, 8, 4, 128, 16, 16, 4, 4, 1, {8, 2}, {8, 2}, {8, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 8, 1, 1, 4}, {16, 1, 1, 16, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}, + + { i8, i8, 128, 1, 4, 64, 128, 8, 4, 4, 1, {4, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {8, 1, 1, 1, 4}, { 1, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}} + // clang-format on + }; +} + +// TODO make this common interface and write specs for it +struct ConvIgemmFwdV6r1DlopsNchwKcyxNkhw +{ + static auto + CalculateCompileParameterBasedOnTunable(const ConvolutionProblemDescriptor& conv_problem_desc, + const TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw& tunable) + { + using namespace ck; + + const int C = conv_problem_desc.C; + const int Y = conv_problem_desc.Y; + const int X = conv_problem_desc.X; + const int Ho = conv_problem_desc.Ho; + const int Wo = conv_problem_desc.Wo; + + if(!(conv_problem_desc.InDataTypeEnum == tunable.ABDataTypeEnum && + conv_problem_desc.WeiDataTypeEnum == tunable.ABDataTypeEnum && + conv_problem_desc.OutDataTypeEnum == tunable.CDataTypeEnum)) + return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false); + + const auto ABDataTypeEnum = conv_problem_desc.InDataTypeEnum; + const auto CDataTypeEnum = conv_problem_desc.OutDataTypeEnum; + + DataTypeEnum_t AccDataTypeEnum; + + switch(ABDataTypeEnum) + { + case DataTypeEnum_t::Float: + case DataTypeEnum_t::Half: AccDataTypeEnum = DataTypeEnum_t::Float; break; + case DataTypeEnum_t::Int8: AccDataTypeEnum = DataTypeEnum_t::Int32; break; + default: return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false); + } + + const int BlockSize = tunable.BlockSize; + + const int GN0 = tunable.GN0; + const int GK1 = tunable.GK1; + + const int GM11 = tunable.GM1PerBlockGM11; + const int GN11 = tunable.GN1PerBlockGN11; + const int GK0PerBlock = tunable.GK0PerBlock; + + const int BM11 = tunable.BM1PerThreadBM11; + const int BN11 = tunable.BN1PerThreadBN11; + const int BK0PerThread = tunable.BK0PerThread; + + const auto BM10BN10ThreadClusterBM10Xs = tunable.BM10BN10ThreadClusterBM10Xs; + const auto BM10BN10ThreadClusterBN10Xs = tunable.BM10BN10ThreadClusterBN10Xs; + + const auto ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = + tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1; + const auto ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = + tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1; + const auto ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = + tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1; + const auto ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = + tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1; + + const auto BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = + tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1; + const auto BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = + 
tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1; + const auto BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = + tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1; + const auto BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = + tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1; + + // C threadwise copy: {BN11} or {BN} or {BN1} or {GN11} is Dst vector dim + const int CThreadTransferDstScalarPerVector = gcd(4, GN11, BN11, Ho * Wo); + + const int C0 = GK1; + + if(!(C % C0 == 0)) + return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false); + + const int C1 = C / C0; + + const int GK0 = C1 * Y * X; + + if(!(GK0 % GK0PerBlock == 0)) + return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false); + + const bool HasMainKBlockLoop = ((GK0 + GK0PerBlock) / (2 * GK0PerBlock) > 1); + + const bool HasDoubleTailKBlockLoop = ((GK0 / GK0PerBlock) % 2 == 0); + + return std::make_tuple( + CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{ + ABDataTypeEnum, + AccDataTypeEnum, + CDataTypeEnum, + BlockSize, + GN0, + GK1, + GM11, + GN11, + GK0PerBlock, + BM11, + BN11, + BK0PerThread, + BM10BN10ThreadClusterBM10Xs, + BM10BN10ThreadClusterBN10Xs, + ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1, + ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1, + ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1, + ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1, + BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1, + BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1, + BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1, + BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1, + CThreadTransferDstScalarPerVector, + HasMainKBlockLoop, + HasDoubleTailKBlockLoop}, + true); + } + + static auto GetDefaultCompileParameter(const ConvolutionProblemDescriptor& conv_problem_desc) + { + for(const auto& tunable : generate_tunable_list_conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw()) + { + CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param; + bool found = false; + + std::tie(compile_param, found) = + CalculateCompileParameterBasedOnTunable(conv_problem_desc, tunable); + + if(found && IsValidCompileParameter(conv_problem_desc, compile_param)) + return std::make_tuple(compile_param, true); + } + + return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false); + } + + static bool IsApplicable(const ConvolutionProblemDescriptor& conv_problem_desc) + { + bool found = false; + + std::tie(std::ignore, found) = GetDefaultCompileParameter(conv_problem_desc); + + return found; + } + + static bool + IsValidCompileParameter(const ConvolutionProblemDescriptor& conv_problem_desc, + const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param) + { + using namespace ck; + + const int N = conv_problem_desc.N; + const int K = conv_problem_desc.K; + const int C = conv_problem_desc.C; + const int Y = conv_problem_desc.Y; + const int X = conv_problem_desc.X; + const int Ho = conv_problem_desc.Ho; + const int Wo = conv_problem_desc.Wo; + + const int GK1 = compile_param.GK1; + const int GN0 = compile_param.GN0; + const int GM11 = compile_param.GM1PerBlockGM11; + const int GN11 = compile_param.GN1PerBlockGN11; + + const int BM11 = compile_param.BM1PerThreadBM11; + const int BN11 = compile_param.BN1PerThreadBN11; + + const int C0 = GK1; + const int N0 = GN0; + + if(!(C % C0 == 0)) + return false; + + const int C1 = C / C0; + + if(!(N % N0 == 0)) + 
return false; + + const int N1 = N / N0; + + const int GM0 = 1; + const int GM1 = K; + const int GN1 = N1 * Ho * Wo; + const int GK0 = C1 * Y * X; + + // check data type + { + if(!(conv_problem_desc.InDataTypeEnum == conv_problem_desc.WeiDataTypeEnum && + conv_problem_desc.InDataTypeEnum == compile_param.ABDataTypeEnum)) + return false; + + if(compile_param.ABDataTypeEnum == DataTypeEnum_t::Float || + compile_param.ABDataTypeEnum == DataTypeEnum_t::Half) + { + if(!(compile_param.AccDataTypeEnum == DataTypeEnum_t::Float)) + return false; + } + else if(compile_param.ABDataTypeEnum == DataTypeEnum_t::Int8) + { + if(!(compile_param.AccDataTypeEnum == DataTypeEnum_t::Int32)) + return false; + } + } + + // check gridwise contraction + { + if(!(GM1 % GM11 == 0 && GN1 % GN11 == 0 && GK0 % compile_param.GK0PerBlock == 0)) + return false; + + const bool has_main_k_block_loop = + ((GK0 + compile_param.GK0PerBlock) / (2 * compile_param.GK0PerBlock) > 1); + + const bool has_double_tail_k_block_loop = ((GK0 / compile_param.GK0PerBlock) % 2 == 0); + + if(!(has_main_k_block_loop == compile_param.HasMainKBlockLoop && + has_double_tail_k_block_loop == compile_param.HasDoubleTailKBlockLoop)) + return false; + } + + // check A blockwise copy + { + const auto block_slice_lengths = + std::array{compile_param.GK0PerBlock, GM0, 1, GM11, GK1}; + const auto& cluster_lengths = + compile_param.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1; + const auto& thread_slice_lengths = + compile_param.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1; + const auto& src_vector_lengths = + compile_param.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1; + const auto& dst_vector_lengths = + compile_param.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1; + + // check number of working thread + const int num_work_thread = std::accumulate( + cluster_lengths.begin(), cluster_lengths.end(), 1, std::multiplies{}); + + if(!(compile_param.BlockSize >= num_work_thread)) + return false; + + // check block slice lengths vs thread slice lengths vs cluster lengths + for(int i = 0; i < 5; ++i) + { + if(!(cluster_lengths[i] * thread_slice_lengths[i] == block_slice_lengths[i])) + return false; + } + + // check thread slice lengths vs vector lengths + for(int i = 0; i < 5; ++i) + { + if(!(thread_slice_lengths[i] % src_vector_lengths[i] == 0)) + return false; + + if(!(thread_slice_lengths[i] % dst_vector_lengths[i] == 0)) + return false; + } + + // check Src vectorization, GK0 is global mem vector dim + if(!(src_vector_lengths[1] == 1 && src_vector_lengths[2] == 1 && + src_vector_lengths[3] == 1 && src_vector_lengths[4] == 1)) + return false; + + // check Dst vectorization, {GM11, GK1} are LDS vector dims + if(dst_vector_lengths[4] == GK1) + { // vectorize on {GM11, GK1} + if(!(GM11 % dst_vector_lengths[3] == 0)) + return false; + } + else + { // vectorize on {GK1} only + if(!(GK1 % dst_vector_lengths[4] == 0)) + return false; + + if(!(dst_vector_lengths[3] == 1)) + return false; + } + } + + // check B blockwise copy + { + const auto block_slice_lengths = + std::array{compile_param.GK0PerBlock, GN0, 1, GN11, GK1}; + const auto& cluster_lengths = + compile_param.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1; + const auto& thread_slice_lengths = + compile_param.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1; + const auto& src_vector_lengths = + compile_param.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1; + const auto& dst_vector_lengths = + 
compile_param.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1; + + // check number of working thread + const int num_work_thread = std::accumulate( + cluster_lengths.begin(), cluster_lengths.end(), 1, std::multiplies{}); + + if(!(compile_param.BlockSize >= num_work_thread)) + return false; + + // check block slice lengths vs thread slice lengths vs cluster lengths + for(int i = 0; i < 5; ++i) + { + if(!(cluster_lengths[i] * thread_slice_lengths[i] == block_slice_lengths[i])) + return false; + } + + // check thread slice lengths vs vector lengths + for(int i = 0; i < 5; ++i) + { + if(!(thread_slice_lengths[i] % src_vector_lengths[i] == 0 && + thread_slice_lengths[i] % dst_vector_lengths[i] == 0)) + return false; + } + + // check Src vectorization: {GN11} is global mem vector dim + if(!(src_vector_lengths[0] == 1 && src_vector_lengths[1] == 1 && + src_vector_lengths[2] == 1 && src_vector_lengths[4] == 1)) + return false; + + // check Src tensor layout related vectorization + if(Y == 1 && X == 1 && conv_problem_desc.ConvStrideH == 1 && + conv_problem_desc.ConvStrideW == 1 && conv_problem_desc.InLeftPadH == 0 && + conv_problem_desc.InLeftPadW == 0 && conv_problem_desc.InRightPadH == 0 && + conv_problem_desc.InRightPadW == 0) + { + if(!((Ho * Wo) % src_vector_lengths[3] == 0)) + return false; + } + else if(conv_problem_desc.ConvStrideW == 1 && conv_problem_desc.InLeftPadW == 0 && + conv_problem_desc.InRightPadW == 0) + { + if(!(Wo % src_vector_lengths[3] == 0)) + return false; + } + else + { + if(!(src_vector_lengths[3] == 1)) + return false; + } + + // check Dst vectorization: {GN11, GK1} are LDS vector dims + if(dst_vector_lengths[4] == GK1) + { // vectorize on {GN11, GK1} + if(!(GN11 % dst_vector_lengths[3] == 0)) + return false; + } + else + { // vectorize on {GK1} only + if(!(dst_vector_lengths[3] == 1)) + return false; + + if(!(GK1 % dst_vector_lengths[4] == 0)) + return false; + } + } + + // check blockwise GEMM + { + const int BM10 = std::accumulate(compile_param.BM10BN10ThreadClusterBM10Xs.begin(), + compile_param.BM10BN10ThreadClusterBM10Xs.end(), + 1, + std::multiplies{}); + + const int BN10 = std::accumulate(compile_param.BM10BN10ThreadClusterBN10Xs.begin(), + compile_param.BM10BN10ThreadClusterBN10Xs.end(), + 1, + std::multiplies{}); + + if(!(compile_param.BlockSize == BM10 * BN10)) + return false; + + const int BM = GM0 * GM11; + const int BN = GN0 * GN11; + + const int BM1 = BM10 * BM11; + const int BN1 = BN10 * BN11; + + if(!(BM % BM1 == 0 && BN % BN1 == 0)) + return false; + + const int BM0 = BM / BM1; + const int BN0 = BN / BN1; + + // blockwise GEMM currently only support BM0 == 2 && BN0 == 2 + if(!(BM0 == 2 && BN0 == 2)) + return false; + + if(!(compile_param.GK0PerBlock % compile_param.BK0PerThread == 0)) + return false; + } + + // check C threadwise copy + { + // {BN11} or {BN} or {BN1} or {GN11} is Dst vector dim + const int dst_vector_len_gn11 = compile_param.CThreadTransferDstScalarPerVector; + + // check slice length vs Dst vector length: + if(!(BN11 % dst_vector_len_gn11 == 0 && GN11 % dst_vector_len_gn11 == 0)) + return false; + + // check Dst memory layout related vectorization: + if(!((Ho * Wo) % compile_param.CThreadTransferDstScalarPerVector == 0)) + return false; + } + + return true; + }; + + static int GetBlockSize(const ConvolutionProblemDescriptor&, + const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param) + { + return compile_param.BlockSize; + } + + static int GetGridSize(const ConvolutionProblemDescriptor& conv_problem_desc, + 
const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param) + { + const int N = conv_problem_desc.N; + const int K = conv_problem_desc.K; + const int Ho = conv_problem_desc.Ho; + const int Wo = conv_problem_desc.Wo; + + const int N0 = compile_param.GN0; + const int N1 = N / N0; + + const int GM1 = K; + const int GN1 = N1 * Ho * Wo; + + const int GM11 = compile_param.GM1PerBlockGM11; + const int GN11 = compile_param.GN1PerBlockGN11; + + const int GM10 = GM1 / GM11; + const int GN10 = GN1 / GN11; + + return GM10 * GN10; + } + + static std::size_t GetWorkSpaceSize(const ConvolutionProblemDescriptor&, + const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw&) + { + // workspace is used for save transformed tensor descritpors created by prepare kernel + return 4096L; + } + + static std::size_t GetMaxWorkSpaceSize(const ConvolutionProblemDescriptor&) { return 4096L; } + + static auto GetTunableList() + { + return generate_tunable_list_conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw(); + } +}; + +} // namespace ck_driver +#endif diff --git a/host/driver_online/include/conv_tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp b/host/driver_online/include/conv_tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp new file mode 100644 index 0000000000..58fe588ad9 --- /dev/null +++ b/host/driver_online/include/conv_tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp @@ -0,0 +1,51 @@ +#ifndef CONV_TUNABLE_FWD_V4R4_DLOPS_NCHW_KCYX_NKHW_HPP +#define CONV_TUNABLE_FWD_V4R4_DLOPS_NCHW_KCYX_NKHW_HPP + +struct tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw +{ + int BlockSize; + + int MPerBlock; + int NPerBlock; + int KPerBlock; + + int M1PerThread; + int N1PerThread; + int KPerThread; + + int M1N1ThreadClusterM10; + int M1N1ThreadClusterN10; + int M1N1ThreadClusterM11; + int M1N1ThreadClusterN11; + + std::array ABlockTransferThreadSliceLengths_K_M0_M1; + std::array ABlockTransferThreadClusterLengths_K_M0_M1; + std::array ABlockTransferThreadClusterArrangeOrder; + std::array ABlockTransferSrcAccessOrder; + int ABlockTransferSrcVectorDim; + int ABlockTransferSrcScalarPerVector; + int ABlockTransferDstScalarPerVector_M1; + bool AThreadTransferSrcResetCoordinateAfterRun; + + std::array BBlockTransferThreadSliceLengths_K_N0_N1; + std::array BBlockTransferThreadClusterLengths_K_N0_N1; + std::array BBlockTransferThreadClusterArrangeOrder; + std::array BBlockTransferSrcAccessOrder; + int BBlockTransferSrcVectorDim; + int BBlockTransferSrcScalarPerVector; + int BBlockTransferDstScalarPerVector_N1; + bool BThreadTransferSrcResetCoordinateAfterRun; + + std::array CThreadTransferSrcDstAccessOrder; + int CThreadTransferSrcDstVectorDim; + int CThreadTransferDstScalarPerVector; +}; + +static tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw + default_tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw = { + 256, 128, 128, 8, 4, 4, 1, + 8, 8, 2, 2, {4, 1, 1}, {2, 1, 128}, {2, 1, 0}, + {2, 1, 0}, 0, 4, 1, false, {4, 1, 1}, {2, 1, 128}, + {0, 1, 2}, {0, 1, 2}, 2, 1, 1, false, {3, 4, 5, 0, 1, 2}, + 5, 1}; +#endif diff --git a/host/driver_online/include/conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp b/host/driver_online/include/conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp new file mode 100644 index 0000000000..97ce326346 --- /dev/null +++ b/host/driver_online/include/conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp @@ -0,0 +1,73 @@ +#ifndef CONV_TUNABLE_FWD_V4R4_XDLOPS_NCHW_KCYX_NKHW_HPP +#define CONV_TUNABLE_FWD_V4R4_XDLOPS_NCHW_KCYX_NKHW_HPP + +struct tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw +{ + int BlockSize; + + int MPerBlock; + int NPerBlock; + 
int KPerBlock; + + int MPerWave; + int NPerWave; + int K1; + + int MRepeat; + int NRepeat; + + std::array ABlockTransferThreadSliceLengths_K0_M_K1; + std::array ABlockTransferThreadClusterLengths_K0_M_K1; + std::array ABlockTransferThreadClusterArrangeOrder; + std::array ABlockTransferSrcAccessOrder; + int ABlockTransferSrcVectorDim; + int ABlockTransferSrcScalarPerVector; + int ABlockTransferDstScalarPerVector_K1; + bool AThreadTransferSrcResetCoordinateAfterRun; + + std::array BBlockTransferThreadSliceLengths_K0_N_K1; + std::array BBlockTransferThreadClusterLengths_K0_N_K1; + std::array BBlockTransferThreadClusterArrangeOrder; + std::array BBlockTransferSrcAccessOrder; + int BBlockTransferSrcVectorDim; + int BBlockTransferSrcScalarPerVector; + int BBlockTransferDstScalarPerVector_K1; + bool BThreadTransferSrcResetCoordinateAfterRun; + + std::array CThreadTransferSrcDstAccessOrder; + int CThreadTransferSrcDstVectorDim; + int CThreadTransferDstScalarPerVector; +}; + +static tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw + default_tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw = { + 256, // BlockSize + 128, // MPerBlock, + 128, // NPerBlock, + 4, // KPerBlock, + 32, // MPerWave, + 32, // NPerWave, + 4, // K1, + 2, // MRepeat, + 2, // NRepeat, + {1, 2, 4}, // ABlockTransferThreadSliceLengths_K0_M_K1, + {4, 64, 1}, // ABlockTransferThreadClusterLengths_K0_M_K1, + {1, 0, 2}, // ABlockTransferThreadClusterArrangeOrder, + {1, 0, 2}, // ABlockTransferSrcAccessOrder, + 2, // ABlockTransferSrcVectorDim + 1, // ABlockTransferSrcScalarPerVector, + 4, // ABlockTransferDstScalarPerVector_K1, + false, // AThreadTransferSrcResetCoordinateAfterRun, + {1, 2, 4}, // BBlockTransferThreadSliceLengths_K0_N_K1, + {4, 64, 1}, // BBlockTransferThreadClusterLengths_K0_N_K1, + {0, 2, 1}, // BBlockTransferThreadClusterArrangeOrder, + {1, 0, 2}, // BBlockTransferSrcAccessOrder, + 1, // BBlockTransferSrcVectorDim + 1, // BBlockTransferSrcScalarPerVector + 4, // BBlockTransferDstScalarPerVector_K1 + false, // BThreadTransferSrcResetCoordinateAfterRun + {3, 0, 1, 2, 7, 5, 4, 6}, // CThreadTransferSrcDstAccessOrder + 7, // CThreadTransferSrcDstVectorDim, + 1 // CThreadTransferDstScalarPerVector +}; +#endif diff --git a/host/driver_online/include/conv_tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp b/host/driver_online/include/conv_tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp new file mode 100644 index 0000000000..263c21a13b --- /dev/null +++ b/host/driver_online/include/conv_tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,73 @@ +#ifndef CONV_TUNABLE_FWD_V4R4_XDLOPS_NHWC_KYXC_NHWK_HPP +#define CONV_TUNABLE_FWD_V4R4_XDLOPS_NHWC_KYXC_NHWK_HPP + +struct tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk +{ + int BlockSize; + + int MPerBlock; + int NPerBlock; + int KPerBlock; + + int MPerWave; + int NPerWave; + int K1; + + int MRepeat; + int NRepeat; + + std::array ABlockTransferThreadSliceLengths_K0_M_K1; + std::array ABlockTransferThreadClusterLengths_K0_M_K1; + std::array ABlockTransferThreadClusterArrangeOrder; + std::array ABlockTransferSrcAccessOrder; + int ABlockTransferSrcVectorDim; + int ABlockTransferSrcScalarPerVector; + int ABlockTransferDstScalarPerVector_K1; + bool AThreadTransferSrcResetCoordinateAfterRun; + + std::array BBlockTransferThreadSliceLengths_K0_N_K1; + std::array BBlockTransferThreadClusterLengths_K0_N_K1; + std::array BBlockTransferThreadClusterArrangeOrder; + std::array BBlockTransferSrcAccessOrder; + int BBlockTransferSrcVectorDim; + int BBlockTransferSrcScalarPerVector; + int 
BBlockTransferDstScalarPerVector_K1; + bool BThreadTransferSrcResetCoordinateAfterRun; + + std::array CThreadTransferSrcDstAccessOrder; + int CThreadTransferSrcDstVectorDim; + int CThreadTransferDstScalarPerVector; +}; + +static tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk + default_tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk = { + 256, // BlockSize + 128, // MPerBlock, + 128, // NPerBlock, + 4, // KPerBlock, + 32, // MPerWave, + 32, // NPerWave, + 4, // K1, + 2, // MRepeat, + 2, // NRepeat, + {1, 2, 4}, // ABlockTransferThreadSliceLengths_K0_M_K1, + {4, 64, 1}, // ABlockTransferThreadClusterLengths_K0_M_K1, + {1, 0, 2}, // ABlockTransferThreadClusterArrangeOrder, + {1, 0, 2}, // ABlockTransferSrcAccessOrder, + 2, // ABlockTransferSrcVectorDim + 4, // ABlockTransferSrcScalarPerVector, + 4, // ABlockTransferDstScalarPerVector_K1, + false, // AThreadTransferSrcResetCoordinateAfterRun, + {1, 2, 4}, // BBlockTransferThreadSliceLengths_K0_N_K1, + {4, 64, 1}, // BBlockTransferThreadClusterLengths_K0_N_K1, + {1, 0, 2}, // BBlockTransferThreadClusterArrangeOrder, + {1, 0, 2}, // BBlockTransferSrcAccessOrder, + 2, // BBlockTransferSrcVectorDim + 4, // BBlockTransferSrcScalarPerVector + 4, // BBlockTransferDstScalarPerVector_K1 + false, // BThreadTransferSrcResetCoordinateAfterRun + {2, 3, 0, 1, 7, 5, 4, 6}, // CThreadTransferSrcDstAccessOrder + 7, // CThreadTransferSrcDstVectorDim, + 1 // CThreadTransferDstScalarPerVector +}; +#endif diff --git a/host/driver_online/include/convolution_problem_descriptor.hpp b/host/driver_online/include/convolution_problem_descriptor.hpp new file mode 100644 index 0000000000..df9c110e70 --- /dev/null +++ b/host/driver_online/include/convolution_problem_descriptor.hpp @@ -0,0 +1,79 @@ +#ifndef CONVOLUTION_PROBLEM_DESCRIPTOR +#define CONVOLUTION_PROBLEM_DESCRIPTOR + +namespace ck_driver { + +struct ConvolutionProblemDescriptor +{ + ConvolutionProblemDescriptor() = default; + + ConvolutionProblemDescriptor(int N_, + int K_, + int C_, + int Y_, + int X_, + int Hi_, + int Wi_, + int Ho_, + int Wo_, + int ConvStrideH_, + int ConvStrideW_, + int ConvDilationH_, + int ConvDilationW_, + int InLeftPadH_, + int InLeftPadW_, + int InRightPadH_, + int InRightPadW_, + ck::DataTypeEnum_t InDataTypeEnum_, + ck::DataTypeEnum_t WeiDataTypeEnum_, + ck::DataTypeEnum_t OutDataTypeEnum_) + : N{N_}, + K{K_}, + C{C_}, + Y{Y_}, + X{X_}, + Hi{Hi_}, + Wi{Wi_}, + Ho{Ho_}, + Wo{Wo_}, + ConvStrideH{ConvStrideH_}, + ConvStrideW{ConvStrideW_}, + ConvDilationH{ConvDilationH_}, + ConvDilationW{ConvDilationW_}, + InLeftPadH{InLeftPadH_}, + InLeftPadW{InLeftPadW_}, + InRightPadH{InRightPadH_}, + InRightPadW{InRightPadW_}, + InDataTypeEnum{InDataTypeEnum_}, + WeiDataTypeEnum{WeiDataTypeEnum_}, + OutDataTypeEnum{OutDataTypeEnum_} + { + } + + int N; + int K; + int C; + int Y; + int X; + int Hi; + int Wi; + int Ho; + int Wo; + int ConvStrideH; + int ConvStrideW; + int ConvDilationH; + int ConvDilationW; + int InLeftPadH; + int InLeftPadW; + int InRightPadH; + int InRightPadW; + + ck::DataTypeEnum_t InDataTypeEnum; + ck::DataTypeEnum_t WeiDataTypeEnum; + ck::DataTypeEnum_t OutDataTypeEnum; + + std::size_t CalculateFlop() const { return 2L * N * K * C * Y * X * Ho * Wo; } +}; + +} // namespace ck_driver +#endif diff --git a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp new file mode 100644 index 0000000000..628bb6d96d --- 
/dev/null +++ b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp @@ -0,0 +1,395 @@ +#pragma once +#include "device.hpp" +#include "host_tensor.hpp" +#include "handle.hpp" +#include "online_driver_common.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp" +#include "conv_tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp" + +namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw { + +template +static std::string get_network_config_string_from_types() +{ + using namespace ck; + + std::string out; + + out += std::to_string(get_datatype_enum_from_type::value) + "_" + + std::to_string(get_datatype_enum_from_type::value) + "_" + + std::to_string(get_datatype_enum_from_type::value); + + return (out); +}; + +static std::string +get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* pt) +{ + std::string out("TUN_"); + + out += std::to_string(pt->BlockSize) + "_"; + + out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" + + std::to_string(pt->KPerBlock) + "_"; + out += std::to_string(pt->M1PerThread) + "x" + std::to_string(pt->N1PerThread) + "x" + + std::to_string(pt->KPerThread) + "_"; + out += std::to_string(pt->M1N1ThreadClusterM10) + "x" + + std::to_string(pt->M1N1ThreadClusterN10) + "x" + + std::to_string(pt->M1N1ThreadClusterM11) + "x" + + std::to_string(pt->M1N1ThreadClusterN11) + "_"; + + out += std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[0]) + "x" + + std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[1]) + "x" + + std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[2]) + "_"; + + out += std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[0]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[1]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[2]) + "_"; + + out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_"; + + out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" + + std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" + + std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_"; + + out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_"; + out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_"; + out += std::to_string(pt->ABlockTransferDstScalarPerVector_M1) + "_"; + out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_"; + + out += std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[0]) + "x" + + std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[1]) + "x" + + std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[2]) + "_"; + + out += std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[0]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[1]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[2]) + "_"; + + out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_"; + + out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" + + std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" + + 
std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_"; + + out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_"; + out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_"; + out += std::to_string(pt->BBlockTransferDstScalarPerVector_N1) + "_"; + out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_"; + + out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "_"; + + out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_"; + out += std::to_string(pt->CThreadTransferDstScalarPerVector); + + return (out); +}; + +template +static std::string get_definition_string_from_types() +{ + using namespace ck; + + std::string out; + + out += + " -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value) + + " -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value) + + " -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value); + + return (out); +}; + +static std::string +get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* pt) +{ + std::string out; + + out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize); + + out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) + + " -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) + + " -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock); + out += " -DCK_PARAM_M1PerThread=" + std::to_string(pt->M1PerThread) + + " -DCK_PARAM_N1PerThread=" + std::to_string(pt->N1PerThread) + + " -DCK_PARAM_KPerThread=" + std::to_string(pt->KPerThread); + + out += " -DCK_PARAM_M1N1ThreadClusterM10=" + std::to_string(pt->M1N1ThreadClusterM10) + + " -DCK_PARAM_M1N1ThreadClusterN10=" + std::to_string(pt->M1N1ThreadClusterN10) + + " -DCK_PARAM_M1N1ThreadClusterM11=" + std::to_string(pt->M1N1ThreadClusterM11) + + " -DCK_PARAM_M1N1ThreadClusterN11=" + std::to_string(pt->M1N1ThreadClusterN11); + + out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K_M0_M1=" + + std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[0]) + "," + + std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[1]) + "," + + std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[2]); + + out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K_M0_M1=" + + std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[0]) + "," + + std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[1]) + "," + + std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[2]); + + out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]); + + out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" + + std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," + + std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," + + std::to_string(pt->ABlockTransferSrcAccessOrder[2]); + + out += + " -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim); + out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" + + 
std::to_string(pt->ABlockTransferSrcScalarPerVector); + out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_M1=" + + std::to_string(pt->ABlockTransferDstScalarPerVector_M1); + out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" + + std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun); + + out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K_N0_N1=" + + std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[0]) + "," + + std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[1]) + "," + + std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[2]); + + out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K_N0_N1=" + + std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[0]) + "," + + std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[1]) + "," + + std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[2]); + + out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]); + + out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" + + std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," + + std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," + + std::to_string(pt->BBlockTransferSrcAccessOrder[2]); + + out += + " -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim); + out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" + + std::to_string(pt->BBlockTransferSrcScalarPerVector); + out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_N1=" + + std::to_string(pt->BBlockTransferDstScalarPerVector_N1); + out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" + + std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun); + + out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]); + + out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" + + std::to_string(pt->CThreadTransferSrcDstVectorDim); + out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" + + std::to_string(pt->CThreadTransferDstScalarPerVector); + + return (out); +}; + +} // namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw + +template +void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw( + olCompile::Handle* handle, + const InLengths& in_n_c_hi_wi_lengths, + const WeiLengths& wei_k_c_y_x_lengths, + const OutLengths& out_n_k_ho_wo_lengths, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const Tensor& in_n_c_hi_wi, + const Tensor& wei_k_c_y_x, + Tensor& out_n_k_ho_wo, + const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* tunable, + ck::index_t nrepeat) +{ + using namespace ck; + using namespace ck_driver; + using namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw; + using size_t = std::size_t; + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////// + // The follow codes are only used for computing the grid_size, hasMainKBlockLoop, + // 
hasDoubleTailKBlockLoop + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const auto in_n_c_hi_wi_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths); + const auto wei_k_c_y_x_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths); + const auto out_n_k_ho_wo_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths); + + const auto descs = + transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc, + in_n_c_hi_wi_desc, + out_n_k_ho_wo_desc, + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads); + const auto a_k_m_grid_desc = descs[I0]; + const auto c_m_n_grid_desc = descs[I2]; + const auto M = c_m_n_grid_desc.GetLength(I0); + const auto N = c_m_n_grid_desc.GetLength(I1); + const auto K = a_k_m_grid_desc.GetLength(I0); + + const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock); + const bool hasMainKBlockLoop = ((K + tunable->KPerBlock) / (2 * tunable->KPerBlock) > 1); + const bool hasDoubleTailKBlockLoop = ((K / tunable->KPerBlock) % 2 == 0); + ///////////////////////////////////////////////////////////////////////////////////////////////////////////// + + // these buffers are usually provided by the user application + DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace()); + DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace()); + DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace()); + + in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data()); + wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data()); + out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data()); + + // these are workspace buffers that should be expressed to the user by the corresponding + // workspace API + DeviceMem workspace_buf(4096); + + void* a_k_m0_m1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer(); + void* b_k_n0_n1_grid_desc_dev_buf = + static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 1024); + void* c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf = + static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 2048); + void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf = + static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 3072); + + const std::vector vld = {static_cast(tunable->BlockSize), 1, 1}; + const std::vector vgd1 = {static_cast(tunable->BlockSize), 1, 1}; + const std::vector vgd2 = {static_cast(grid_size * tunable->BlockSize), 1, 1}; + + std::string program_name = + "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp"; + std::string algo_name = "implicit_gemm_conv_fwd_v4r4_dlops_nchw"; + + std::string param = " -std=c++17 "; + std::string network_config; + + param += get_definition_string_from_types() + " " + + get_definition_string_from_tunable(tunable) + + " -DCK_PARAM_HAS_MAIN_KBLOCK_LOOP=" + std::to_string(hasMainKBlockLoop) + + " -DCK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP=" + std::to_string(hasDoubleTailKBlockLoop); + network_config = get_network_config_string_from_types() + "_" + + get_network_config_string_from_tunable(tunable) + "_" + + std::to_string(hasMainKBlockLoop) + "_" + + std::to_string(hasDoubleTailKBlockLoop); + + std::vector kernel1_times; + std::vector kernel2_times; + + for(index_t i = 0; i < nrepeat; ++i) + { + KernelTimer timer1, timer2; + std::string kernel_name; + + kernel_name = 
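+        // A quick sanity check of the K-loop flags computed above, with purely
+        // illustrative numbers (not taken from any tunable in this patch):
+        // assume KPerBlock = 8 and a GEMM reduction length K = C*Y*X = 32, then
+        //   hasMainKBlockLoop       = ((32 + 8) / (2 * 8) > 1) = (2 > 1)  -> true
+        //   hasDoubleTailKBlockLoop = ((32 / 8) % 2 == 0)      = (0 == 0) -> true
+        // which the kernel uses to decide whether to run its main K loop and how
+        // to finish the final K block(s).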
"dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare"; + auto network_config_1 = network_config + "_1"; + + timer1.Start(); + handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)( + static_cast(in_n_c_hi_wi_lengths[I0]), + static_cast(in_n_c_hi_wi_lengths[I1]), + static_cast(in_n_c_hi_wi_lengths[I2]), + static_cast(in_n_c_hi_wi_lengths[I3]), + static_cast(wei_k_c_y_x_lengths[I0]), + static_cast(wei_k_c_y_x_lengths[I2]), + static_cast(wei_k_c_y_x_lengths[I3]), + conv_strides[I0], + conv_strides[I1], + conv_dilations[I0], + conv_dilations[I1], + in_left_pads[I0], + in_left_pads[I1], + in_right_pads[I0], + in_right_pads[I1], + a_k_m0_m1_grid_desc_dev_buf, + b_k_n0_n1_grid_desc_dev_buf, + c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf, + c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf); + timer1.End(); + + kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw"; + auto network_config_2 = network_config + "_2"; + + timer2.Start(); + handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)( + reinterpret_cast(wei_k_c_y_x_dev_buf.GetDeviceBuffer()), + reinterpret_cast(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()), + reinterpret_cast(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()), + (const void*)(a_k_m0_m1_grid_desc_dev_buf), + (const void*)(b_k_n0_n1_grid_desc_dev_buf), + (const void*)(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf), + (const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf)); + timer2.End(); + + kernel1_times.push_back(timer1.GetElapsedTime()); + kernel2_times.push_back(timer2.GetElapsedTime()); + } + + { + auto ave_time1 = + std::accumulate( + std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus{}) / + (nrepeat - 1); + auto ave_time2 = + std::accumulate( + std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus{}) / + (nrepeat - 1); + + const auto N = in_n_c_hi_wi_lengths[I0]; + const auto C = in_n_c_hi_wi_lengths[I1]; + + const auto K = out_n_k_ho_wo_lengths[I1]; + const auto Ho = out_n_k_ho_wo_lengths[I2]; + const auto Wo = out_n_k_ho_wo_lengths[I3]; + + const auto Y = wei_k_c_y_x_lengths[I2]; + const auto X = wei_k_c_y_x_lengths[I3]; + + float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / + (std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2); + + std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", " + << ave_time2 << "), " << perf << " TFlop/s" << std::endl; + }; + + // copy result back to host + out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data()); +} diff --git a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp new file mode 100644 index 0000000000..1e213b92e1 --- /dev/null +++ b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp @@ -0,0 +1,386 @@ +#include "device.hpp" +#include "host_tensor.hpp" +#include "handle.hpp" +#include "online_driver_common.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp" + +namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw { + +template +static std::string get_network_config_string_from_types() +{ + using namespace ck; + + std::string out; + + out += 
std::to_string(get_datatype_enum_from_type::value) + "_" + + std::to_string(get_datatype_enum_from_type::value) + "_" + + std::to_string(get_datatype_enum_from_type::value); + + return (out); +}; + +static std::string +get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* pt) +{ + std::string out("TUN_"); + + out += std::to_string(pt->BlockSize) + "_"; + + out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" + + std::to_string(pt->KPerBlock) + "_"; + out += std::to_string(pt->MPerWave) + "x" + std::to_string(pt->NPerWave) + "x" + + std::to_string(pt->MRepeat) + "x" + std::to_string(pt->NRepeat) + "x" + + std::to_string(pt->K1) + "_"; + + out += std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "x" + + std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "x" + + std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]) + "_"; + + out += std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]) + "_"; + + out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_"; + + out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" + + std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" + + std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_"; + + out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_"; + out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_"; + out += std::to_string(pt->ABlockTransferDstScalarPerVector_K1) + "_"; + out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_"; + + out += std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "x" + + std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "x" + + std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]) + "_"; + + out += std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]) + "_"; + + out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_"; + + out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" + + std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" + + std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_"; + + out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_"; + out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_"; + out += std::to_string(pt->BBlockTransferDstScalarPerVector_K1) + "_"; + out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_"; + + out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "x" + + 
std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]) + "_"; + + out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_"; + out += std::to_string(pt->CThreadTransferDstScalarPerVector); + + return (out); +}; + +template +static std::string get_definition_string_from_types() +{ + using namespace ck; + + std::string out; + + out += + " -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value) + + " -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value) + + " -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value); + + return (out); +}; + +static std::string +get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* pt) +{ + std::string out; + + out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize); + + out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) + + " -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) + + " -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock); + out += " -DCK_PARAM_MPerWave=" + std::to_string(pt->MPerWave) + + " -DCK_PARAM_NPerWave=" + std::to_string(pt->NPerWave) + + " -DCK_PARAM_K1=" + std::to_string(pt->K1) + + " -DCK_PARAM_MRepeat=" + std::to_string(pt->MRepeat) + + " -DCK_PARAM_NRepeat=" + std::to_string(pt->NRepeat); + + out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1=" + + std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "," + + std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "," + + std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]); + + out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1=" + + std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "," + + std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "," + + std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]); + + out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]); + + out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" + + std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," + + std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," + + std::to_string(pt->ABlockTransferSrcAccessOrder[2]); + + out += + " -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim); + out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" + + std::to_string(pt->ABlockTransferSrcScalarPerVector); + out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_K1=" + + std::to_string(pt->ABlockTransferDstScalarPerVector_K1); + out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" + + std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun); + + out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1=" + + std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "," + + std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "," + + std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]); + + out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1=" + + std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "," + + std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "," + + std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]); + + out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" + + 
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]); + + out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" + + std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," + + std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," + + std::to_string(pt->BBlockTransferSrcAccessOrder[2]); + + out += + " -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim); + out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" + + std::to_string(pt->BBlockTransferSrcScalarPerVector); + out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_K1=" + + std::to_string(pt->BBlockTransferDstScalarPerVector_K1); + out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" + + std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun); + + out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]); + + out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" + + std::to_string(pt->CThreadTransferSrcDstVectorDim); + out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" + + std::to_string(pt->CThreadTransferDstScalarPerVector); + + return (out); +}; + +} // namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw + +template +void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw( + olCompile::Handle* handle, + const InLengths& in_n_c_hi_wi_lengths, + const WeiLengths& wei_k_c_y_x_lengths, + const OutLengths& out_n_k_ho_wo_lengths, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const Tensor& in_n_c_hi_wi, + const Tensor& wei_k_c_y_x, + Tensor& out_n_k_ho_wo, + const tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* tunable, + ck::index_t nrepeat) +{ + using namespace ck; + using namespace ck_driver; + using namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw; + using size_t = std::size_t; + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////// + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const auto in_n_c_hi_wi_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths); + const auto wei_k_c_y_x_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths); + const auto out_n_k_ho_wo_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths); + + const auto n = in_n_c_hi_wi_desc.GetLength(I0); + const auto c = in_n_c_hi_wi_desc.GetLength(I1); + const auto hi = in_n_c_hi_wi_desc.GetLength(I2); + const auto wi = in_n_c_hi_wi_desc.GetLength(I3); + const auto k = wei_k_c_y_x_desc.GetLength(I0); + const auto y = wei_k_c_y_x_desc.GetLength(I2); + const auto x = wei_k_c_y_x_desc.GetLength(I3); + const auto ho = out_n_k_ho_wo_desc.GetLength(I2); + const auto wo = 
out_n_k_ho_wo_desc.GetLength(I3); + + const auto M = k; + const auto N = n * ho * wo; + const auto K = c * y * x; + const auto K0 = K / tunable->K1; + + const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock); + ///////////////////////////////////////////////////////////////////////////////////////////////////////////// + + // these buffers are usually provided by the user application + DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace()); + DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace()); + DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace()); + + in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data()); + wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data()); + out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data()); + + // these are workspace buffers that should be expressed to the user by the corresponding + // workspace API + DeviceMem workspace_buf(4096); + + void* a_k_m0_m1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer(); + void* b_k_n0_n1_grid_desc_dev_buf = + static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 1024); + void* c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf = + static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 2048); + void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf = + static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 3072); + + const std::vector vld = {static_cast(tunable->BlockSize), 1, 1}; + const std::vector vgd1 = {static_cast(tunable->BlockSize), 1, 1}; + const std::vector vgd2 = {static_cast(grid_size * tunable->BlockSize), 1, 1}; + + std::string program_name = + "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp"; + std::string algo_name = "implicit_gemm_conv_fwd_v4r4_xdlops_nchw"; + + std::string param = " -std=c++17 "; + std::string network_config; + + param += get_definition_string_from_types() + " " + " -DCK_USE_AMD_XDLOPS" + + get_definition_string_from_tunable(tunable); + + network_config = get_network_config_string_from_types() + "_" + + get_network_config_string_from_tunable(tunable); + + std::vector kernel1_times; + std::vector kernel2_times; + + for(index_t i = 0; i < nrepeat; ++i) + { + KernelTimer timer1, timer2; + std::string kernel_name; + + kernel_name = + "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_prepare"; + auto network_config_1 = network_config + "_1"; + + timer1.Start(); + handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)( + static_cast(in_n_c_hi_wi_lengths[I0]), + static_cast(in_n_c_hi_wi_lengths[I1]), + static_cast(in_n_c_hi_wi_lengths[I2]), + static_cast(in_n_c_hi_wi_lengths[I3]), + static_cast(wei_k_c_y_x_lengths[I0]), + static_cast(wei_k_c_y_x_lengths[I2]), + static_cast(wei_k_c_y_x_lengths[I3]), + conv_strides[I0], + conv_strides[I1], + conv_dilations[I0], + conv_dilations[I1], + in_left_pads[I0], + in_left_pads[I1], + in_right_pads[I0], + in_right_pads[I1], + a_k_m0_m1_grid_desc_dev_buf, + b_k_n0_n1_grid_desc_dev_buf, + c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf, + c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf); + timer1.End(); + + kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw"; + auto network_config_2 = network_config + "_2"; + + timer2.Start(); + handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)( + reinterpret_cast(wei_k_c_y_x_dev_buf.GetDeviceBuffer()), + 
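+            // Illustration of the implicit-GEMM problem sizes derived above
+            // (M = k, N = n*ho*wo, K = c*y*x, K0 = K / K1); the numbers below are
+            // hypothetical and only meant to show the mapping:
+            //   n = 2, c = 64, k = 256, 3x3 filter, ho = wo = 16, K1 = 4
+            //   => M = 256, N = 2*16*16 = 512, K = 64*3*3 = 576, K0 = 576/4 = 144
+            //   and with MPerBlock = NPerBlock = 128:
+            //   grid_size = (256/128) * (512/128) = 2 * 4 = 8 workgroups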
reinterpret_cast(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()), + reinterpret_cast(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()), + (const void*)(a_k_m0_m1_grid_desc_dev_buf), + (const void*)(b_k_n0_n1_grid_desc_dev_buf), + (const void*)(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf), + (const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf)); + timer2.End(); + + kernel1_times.push_back(timer1.GetElapsedTime()); + kernel2_times.push_back(timer2.GetElapsedTime()); + } + + { + auto ave_time1 = + std::accumulate( + std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus{}) / + (nrepeat - 1); + auto ave_time2 = + std::accumulate( + std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus{}) / + (nrepeat - 1); + + const auto N = in_n_c_hi_wi_lengths[I0]; + const auto C = in_n_c_hi_wi_lengths[I1]; + + const auto K = out_n_k_ho_wo_lengths[I1]; + const auto Ho = out_n_k_ho_wo_lengths[I2]; + const auto Wo = out_n_k_ho_wo_lengths[I3]; + + const auto Y = wei_k_c_y_x_lengths[I2]; + const auto X = wei_k_c_y_x_lengths[I3]; + + float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / + (std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2); + + std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", " + << ave_time2 << "), " << perf << " TFlop/s" << std::endl; + }; + + // copy result back to host + out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data()); +} diff --git a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp new file mode 100644 index 0000000000..8eed1a9934 --- /dev/null +++ b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,389 @@ +#include "device.hpp" +#include "host_tensor.hpp" +#include "handle.hpp" +#include "online_driver_common.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp" +#include "conv_tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp" + +namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk { + +template +static std::string get_network_config_string_from_types() +{ + using namespace ck; + + std::string out; + + out += std::to_string(get_datatype_enum_from_type::value) + "_" + + std::to_string(get_datatype_enum_from_type::value) + "_" + + std::to_string(get_datatype_enum_from_type::value); + + return (out); +}; + +static std::string +get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* pt) +{ + std::string out("TUN_"); + + out += std::to_string(pt->BlockSize) + "_"; + + out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" + + std::to_string(pt->KPerBlock) + "_"; + out += std::to_string(pt->MPerWave) + "x" + std::to_string(pt->NPerWave) + "x" + + std::to_string(pt->MRepeat) + "x" + std::to_string(pt->NRepeat) + "x" + + std::to_string(pt->K1) + "_"; + + out += std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "x" + + std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "x" + + std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]) + "_"; + + out += std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "x" + + 
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]) + "_"; + + out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_"; + + out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" + + std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" + + std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_"; + + out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_"; + out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_"; + out += std::to_string(pt->ABlockTransferDstScalarPerVector_K1) + "_"; + out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_"; + + out += std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "x" + + std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "x" + + std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]) + "_"; + + out += std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]) + "_"; + + out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_"; + + out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" + + std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" + + std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_"; + + out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_"; + out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_"; + out += std::to_string(pt->BBlockTransferDstScalarPerVector_K1) + "_"; + out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_"; + + out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]) + "_"; + + out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_"; + out += std::to_string(pt->CThreadTransferDstScalarPerVector); + + return (out); +}; + +template +static std::string get_definition_string_from_types() +{ + using namespace ck; + + std::string out; + + out += + " -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value) + + " -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value) + + " -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value); + + return (out); +}; + +static std::string +get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* pt) +{ + std::string out; + + out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize); + + out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) + + " -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) + + " -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock); + out += " -DCK_PARAM_MPerWave=" + std::to_string(pt->MPerWave) + + " -DCK_PARAM_NPerWave=" + 
std::to_string(pt->NPerWave) + + " -DCK_PARAM_K1=" + std::to_string(pt->K1) + + " -DCK_PARAM_MRepeat=" + std::to_string(pt->MRepeat) + + " -DCK_PARAM_NRepeat=" + std::to_string(pt->NRepeat); + + out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1=" + + std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "," + + std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "," + + std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]); + + out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1=" + + std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "," + + std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "," + + std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]); + + out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]); + + out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" + + std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," + + std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," + + std::to_string(pt->ABlockTransferSrcAccessOrder[2]); + + out += + " -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim); + out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" + + std::to_string(pt->ABlockTransferSrcScalarPerVector); + out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_K1=" + + std::to_string(pt->ABlockTransferDstScalarPerVector_K1); + out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" + + std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun); + + out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1=" + + std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "," + + std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "," + + std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]); + + out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1=" + + std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "," + + std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "," + + std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]); + + out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]); + + out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" + + std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," + + std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," + + std::to_string(pt->BBlockTransferSrcAccessOrder[2]); + + out += + " -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim); + out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" + + std::to_string(pt->BBlockTransferSrcScalarPerVector); + out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_K1=" + + std::to_string(pt->BBlockTransferDstScalarPerVector_K1); + out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" + + std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun); + + out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," + + 
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]); + + out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" + + std::to_string(pt->CThreadTransferSrcDstVectorDim); + out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" + + std::to_string(pt->CThreadTransferDstScalarPerVector); + + return (out); +}; + +} // namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk + +template +void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk( + olCompile::Handle* handle, + const InLengths& in_n_hi_wi_c_lengths, + const WeiLengths& wei_k_y_x_c_lengths, + const OutLengths& out_n_ho_wo_k_lengths, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const Tensor& in_n_hi_wi_c, + const Tensor& wei_k_y_x_c, + Tensor& out_n_ho_wo_k, + const tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* tunable, + ck::index_t nrepeat) +{ + using namespace ck; + using namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk; + using size_t = std::size_t; + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////// + // The follow codes are only used for computing the grid_size, hasMainKBlockLoop, + // hasDoubleTailKBlockLoop + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const auto in_n_hi_wi_c_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths); + const auto wei_k_y_x_c_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths); + const auto out_n_ho_wo_k_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths); + + const auto n = in_n_hi_wi_c_desc.GetLength(I0); + const auto hi = in_n_hi_wi_c_desc.GetLength(I1); + const auto wi = in_n_hi_wi_c_desc.GetLength(I2); + const auto c = in_n_hi_wi_c_desc.GetLength(I3); + + const auto k = wei_k_y_x_c_desc.GetLength(I0); + const auto y = wei_k_y_x_c_desc.GetLength(I1); + const auto x = wei_k_y_x_c_desc.GetLength(I2); + + const auto ho = out_n_ho_wo_k_desc.GetLength(I1); + const auto wo = out_n_ho_wo_k_desc.GetLength(I2); + + const auto M = k; + const auto N = n * ho * wo; + const auto K = c * y * x; + const auto K0 = K / tunable->K1; + + const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock); + + // these buffers are usually provided by the user application + DeviceMem in_n_hi_wi_c_dev_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace()); + DeviceMem wei_k_y_x_c_dev_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace()); + DeviceMem out_n_ho_wo_k_dev_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace()); + + in_n_hi_wi_c_dev_buf.ToDevice(in_n_hi_wi_c.mData.data()); + wei_k_y_x_c_dev_buf.ToDevice(wei_k_y_x_c.mData.data()); + out_n_ho_wo_k_dev_buf.ToDevice(out_n_ho_wo_k.mData.data()); + + // these are workspace buffers that should be expressed to the user by the corresponding + // workspace API + DeviceMem workspace_buf(4096); + + void* a_k0_m_k1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer(); + void* b_k0_n_k1_grid_desc_dev_buf = + 
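+    // The 4096-byte workspace above is carved into four 1024-byte regions
+    // (offsets 0 / 1024 / 2048 / 3072), one per transformed descriptor written by
+    // the *_prepare kernel; the implicit assumption is that each packed descriptor
+    // fits within 1 KiB, which this driver does not check.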
static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 1024); + void* c_m0_m1_m2_n_grid_desc_dev_buf = + static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 2048); + void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf = + static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 3072); + + const std::vector vld = {static_cast(tunable->BlockSize), 1, 1}; + const std::vector vgd1 = {static_cast(tunable->BlockSize), 1, 1}; + const std::vector vgd2 = {static_cast(grid_size * tunable->BlockSize), 1, 1}; + + std::string program_name = + "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp"; + std::string algo_name = "implicit_gemm_conv_fwd_v4r4_xdlops_nhwc"; + + std::string param = " -std=c++17 "; + std::string network_config; + + param += get_definition_string_from_types() + " -DCK_USE_AMD_XDLOPS "; + param += get_definition_string_from_tunable(tunable); + + network_config = get_network_config_string_from_types() + "_" + + get_network_config_string_from_tunable(tunable); + + std::vector kernel1_times; + std::vector kernel2_times; + + for(index_t i = 0; i < nrepeat; ++i) + { + KernelTimer timer1, timer2; + std::string kernel_name; + + kernel_name = + "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_prepare"; + auto network_config_1 = network_config + "_1"; + + timer1.Start(); + handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)( + static_cast(in_n_hi_wi_c_lengths[I0]), + static_cast(in_n_hi_wi_c_lengths[I1]), + static_cast(in_n_hi_wi_c_lengths[I2]), + static_cast(in_n_hi_wi_c_lengths[I3]), + static_cast(wei_k_y_x_c_lengths[I0]), + static_cast(wei_k_y_x_c_lengths[I1]), + static_cast(wei_k_y_x_c_lengths[I2]), + conv_strides[I0], + conv_strides[I1], + conv_dilations[I0], + conv_dilations[I1], + in_left_pads[I0], + in_left_pads[I1], + in_right_pads[I0], + in_right_pads[I1], + a_k0_m_k1_grid_desc_dev_buf, + b_k0_n_k1_grid_desc_dev_buf, + c_m0_m1_m2_n_grid_desc_dev_buf, + c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf); + timer1.End(); + + kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk"; + auto network_config_2 = network_config + "_2"; + + timer2.Start(); + handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)( + reinterpret_cast(in_n_hi_wi_c_dev_buf.GetDeviceBuffer()), + reinterpret_cast(wei_k_y_x_c_dev_buf.GetDeviceBuffer()), + reinterpret_cast(out_n_ho_wo_k_dev_buf.GetDeviceBuffer()), + (const void*)(a_k0_m_k1_grid_desc_dev_buf), + (const void*)(b_k0_n_k1_grid_desc_dev_buf), + (const void*)(c_m0_m1_m2_n_grid_desc_dev_buf), + (const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf)); + timer2.End(); + + kernel1_times.push_back(timer1.GetElapsedTime()); + kernel2_times.push_back(timer2.GetElapsedTime()); + } + + { + auto ave_time1 = + std::accumulate( + std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus{}) / + (nrepeat - 1); + auto ave_time2 = + std::accumulate( + std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus{}) / + (nrepeat - 1); + + const auto N = in_n_hi_wi_c_lengths[I0]; + const auto C = in_n_hi_wi_c_lengths[I3]; + + const auto Ho = out_n_ho_wo_k_lengths[I1]; + const auto Wo = out_n_ho_wo_k_lengths[I2]; + const auto K = out_n_ho_wo_k_lengths[I3]; + + const auto Y = wei_k_y_x_c_lengths[I1]; + const auto X = wei_k_y_x_c_lengths[I2]; + + float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / + (std::size_t(1000) * 1000 * 1000) / ave_time2; + + std::cout << "Average 
time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", " + << ave_time2 << "), " << perf << " TFlop/s" << std::endl; + }; + + // copy result back to host + out_n_ho_wo_k_dev_buf.FromDevice(out_n_ho_wo_k.mData.data()); +} diff --git a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp new file mode 100644 index 0000000000..260c94ee0e --- /dev/null +++ b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp @@ -0,0 +1,182 @@ +#pragma once +#include "device.hpp" +#include "host_tensor.hpp" +#include "handle.hpp" +#include "online_driver_common.hpp" +#include "convolution_problem_descriptor.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp" +#include "conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp" + +template +void online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw( + olCompile::Handle* handle, + const InLengths& in_n_c_hi_wi_lengths, + const WeiLengths& wei_k_c_y_x_lengths, + const OutLengths& out_n_k_ho_wo_lengths, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const Tensor& in_n_c_hi_wi, + const Tensor& wei_k_c_y_x, + Tensor& out_n_k_ho_wo, + const ck_driver::CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param, + ck::index_t nrepeat) +{ + using namespace ck; + using namespace ck_driver; + using size_t = std::size_t; + + std::cout << __func__ << std::endl; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + ConvolutionProblemDescriptor conv_problem_desc{in_n_c_hi_wi_lengths[I0], + out_n_k_ho_wo_lengths[I1], + in_n_c_hi_wi_lengths[I1], + wei_k_c_y_x_lengths[I2], + wei_k_c_y_x_lengths[I3], + in_n_c_hi_wi_lengths[I2], + in_n_c_hi_wi_lengths[I3], + out_n_k_ho_wo_lengths[I2], + out_n_k_ho_wo_lengths[I3], + conv_strides[I0], + conv_strides[I1], + conv_dilations[I0], + conv_dilations[I1], + in_left_pads[I0], + in_left_pads[I1], + in_right_pads[I0], + in_right_pads[I1], + get_datatype_enum_from_type::value, + get_datatype_enum_from_type::value, + get_datatype_enum_from_type::value}; + + if(!ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::IsValidCompileParameter(conv_problem_desc, + compile_param)) + { + throw std::runtime_error("wrong! 
IsValidCompileParameter fail"); + } + + DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace()); + DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace()); + DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace()); + + in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data()); + wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data()); + out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data()); + + // workspace is used for save transformed tensor descritpors created by prepare kernel + DeviceMem workspace_dev_buf( + ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetWorkSpaceSize(conv_problem_desc, compile_param)); + + const auto block_size = std::size_t( + ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetBlockSize(conv_problem_desc, compile_param)); + + const auto grid_size = std::size_t( + ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetGridSize(conv_problem_desc, compile_param)); + + const std::vector vld1 = {1, 1, 1}; + const std::vector vgd1 = {1, 1, 1}; + + const std::vector vld2 = {static_cast(block_size), 1, 1}; + const std::vector vgd2 = {static_cast(grid_size * block_size), 1, 1}; + + std::string program_name = + "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp"; + std::string algo_name = "implicit_gemm_conv_fwd_v6r1_dlops_nchw"; + + std::string compile_param_string = " -std=c++17 " + compile_param.GetCompileParameterString(); + std::string network_config = compile_param_string; + + std::vector kernel1_times; + std::vector kernel2_times; + + for(index_t i = 0; i < nrepeat; ++i) + { + KernelTimer timer1, timer2; + std::string kernel_name; + + kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare"; + auto network_config_1 = network_config + "_1"; + + timer1.Start(); + handle->AddKernel(algo_name, + network_config_1, + program_name, + kernel_name, + vld1, + vgd1, + compile_param_string)(static_cast(in_n_c_hi_wi_lengths[I0]), + static_cast(in_n_c_hi_wi_lengths[I1]), + static_cast(in_n_c_hi_wi_lengths[I2]), + static_cast(in_n_c_hi_wi_lengths[I3]), + static_cast(wei_k_c_y_x_lengths[I0]), + static_cast(wei_k_c_y_x_lengths[I2]), + static_cast(wei_k_c_y_x_lengths[I3]), + conv_strides[I0], + conv_strides[I1], + conv_dilations[I0], + conv_dilations[I1], + in_left_pads[I0], + in_left_pads[I1], + in_right_pads[I0], + in_right_pads[I1], + (void*)(workspace_dev_buf.GetDeviceBuffer())); + timer1.End(); + + kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw"; + auto network_config_2 = network_config + "_2"; + + timer2.Start(); + handle->AddKernel(algo_name, + network_config_2, + program_name, + kernel_name, + vld2, + vgd2, + compile_param_string)( + reinterpret_cast(wei_k_c_y_x_dev_buf.GetDeviceBuffer()), + reinterpret_cast(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()), + reinterpret_cast(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()), + (const void*)(workspace_dev_buf.GetDeviceBuffer())); + timer2.End(); + + kernel1_times.push_back(timer1.GetElapsedTime()); + kernel2_times.push_back(timer2.GetElapsedTime()); + } + + { + auto ave_time1 = + std::accumulate( + std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus{}) / + (nrepeat - 1); + auto ave_time2 = + std::accumulate( + std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus{}) / + (nrepeat - 1); + + float perf = (float)(conv_problem_desc.CalculateFlop()) / + (std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2); + + std::cout << "Average time : " << ave_time1 + ave_time2 << " 
ms(" << ave_time1 << ", " + << ave_time2 << "), " << perf << " TFlop/s" << std::endl; + }; + + // copy result back to host + out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data()); +} diff --git a/host/driver_online/include/online_driver_common.hpp b/host/driver_online/include/online_driver_common.hpp new file mode 100644 index 0000000000..472ffb52dc --- /dev/null +++ b/host/driver_online/include/online_driver_common.hpp @@ -0,0 +1,44 @@ +#ifndef ONLINE_DRIVER_COMMON_HPP +#define ONLINE_DRIVER_COMMON_HPP + +namespace ck_driver { + +// greatest common divisor, aka highest common factor +inline int gcd(int x, int y) +{ + if(x < 0) + { + return gcd(-x, y); + } + else if(y < 0) + { + return gcd(x, -y); + } + else if(x == y || x == 0) + { + return y; + } + else if(y == 0) + { + return x; + } + else if(x > y) + { + return gcd(x % y, y); + } + else + { + return gcd(x, y % x); + } +} + +template = 2, bool>::type = false> +auto gcd(X x, Ys... ys) +{ + return gcd(x, gcd(ys...)); +} + +} // namespace ck_driver +#endif diff --git a/host/host_tensor/CMakeLists.txt b/host/host_tensor/CMakeLists.txt new file mode 100644 index 0000000000..9c30275220 --- /dev/null +++ b/host/host_tensor/CMakeLists.txt @@ -0,0 +1,19 @@ +include_directories(BEFORE + include +) + +set(HOST_TENSOR_SOURCE + src/host_tensor.cpp; + src/device.cpp; +) + +## the library target +add_library(host_tensor SHARED ${HOST_TENSOR_SOURCE}) + +target_link_libraries(host_tensor PRIVATE hip::device) +target_link_libraries(host_tensor INTERFACE hip::host) + +target_compile_features(host_tensor PUBLIC) +set_target_properties(host_tensor PROPERTIES POSITION_INDEPENDENT_CODE ON) + +install(TARGETS host_tensor LIBRARY DESTINATION lib) diff --git a/host/host_tensor/include/conv_common.hpp b/host/host_tensor/include/conv_common.hpp new file mode 100644 index 0000000000..73126b3c79 --- /dev/null +++ b/host/host_tensor/include/conv_common.hpp @@ -0,0 +1,86 @@ +#ifndef CONV_COMMON_HPP +#define CONV_COMMON_HPP + +#include "dynamic_tensor_descriptor.hpp" + +enum ConvTensorLayout +{ + NCHW, + NHWC, + CHWN, + NCHWc, + NHWCc +}; + +template +constexpr auto get_convolution_output_default_4d_tensor_descriptor( + const ck::DynamicTensorDescriptor& in_desc, + const ck::DynamicTensorDescriptor& wei_desc, + const ConvStrides& conv_strides, + const ConvDilations conv_dilations, + const LeftPads& left_pads, + const RightPads& right_pads) +{ + using namespace ck; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + assert(in_desc.GetNumOfDimension() == 4); + assert(wei_desc.GetNumOfDimension() == 4); + assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1)); + + const auto N = in_desc.GetLength(I0); + const auto Hi = in_desc.GetLength(I2); + const auto Wi = in_desc.GetLength(I3); + + const auto K = wei_desc.GetLength(I0); + const auto Y = wei_desc.GetLength(I2); + const auto X = wei_desc.GetLength(I3); + + const auto LeftPadH = left_pads[I0]; + const auto LeftPadW = left_pads[I1]; + + const auto RightPadH = right_pads[I0]; + const auto RightPadW = right_pads[I1]; + + const auto YEff = (Y - I1) * conv_dilations[I0] + I1; + const auto XEff = (X - I1) * conv_dilations[I1] + I1; + + const auto Ho = (Hi + LeftPadH + RightPadH - YEff) / conv_strides[I0] + I1; + const auto Wo = (Wi + LeftPadW + RightPadW - XEff) / conv_strides[I1] + I1; + + return make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho, Wo)); +} + +template +constexpr std::size_t 
+calculate_convolution_flops(const InDesc& in_desc, const WeiDesc& wei_desc, const OutDesc& out_desc) +{ + using namespace ck; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const index_t N = out_desc.GetLength(I0); + const index_t K = out_desc.GetLength(I1); + const index_t Ho = out_desc.GetLength(I2); + const index_t Wo = out_desc.GetLength(I3); + + const index_t C = wei_desc.GetLength(I1); + const index_t Y = wei_desc.GetLength(I2); + const index_t X = wei_desc.GetLength(I3); + + return std::size_t(2) * N * K * Ho * Wo * C * Y * X; +} + +#endif diff --git a/host/host_tensor/include/device.hpp b/host/host_tensor/include/device.hpp new file mode 100644 index 0000000000..2299e14921 --- /dev/null +++ b/host/host_tensor/include/device.hpp @@ -0,0 +1,86 @@ +#ifndef DEVICE_HPP +#define DEVICE_HPP + +#include +#include "hip/hip_runtime.h" +#include "hip/hip_fp16.h" + +struct DeviceMem +{ + DeviceMem() = delete; + DeviceMem(std::size_t mem_size); + void* GetDeviceBuffer(); + void ToDevice(const void* p); + void FromDevice(void* p); + ~DeviceMem(); + + void* mpDeviceBuf; + std::size_t mMemSize; +}; + +struct KernelTimerImpl; + +struct KernelTimer +{ + KernelTimer(); + ~KernelTimer(); + void Start(); + void End(); + float GetElapsedTime() const; + + std::unique_ptr impl; +}; + +using device_stream_t = hipStream_t; + +template +void launch_kernel(F kernel, + dim3 grid_dim, + dim3 block_dim, + std::size_t lds_byte, + hipStream_t stream_id, + Args... args) +{ + hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...); +} + +template +float launch_and_time_kernel(F kernel, + int nrepeat, + dim3 grid_dim, + dim3 block_dim, + std::size_t lds_byte, + hipStream_t stream_id, + Args... 
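+// On the FLOP count in calculate_convolution_flops above: every output element
+// needs C*Y*X multiply-accumulates, counted as 2 FLOPs each (one mul, one add),
+// over N*K*Ho*Wo outputs, hence 2*N*K*Ho*Wo*C*Y*X. The drivers divide this by
+// 10^9 (GFLOP) and by the elapsed time in milliseconds, so the printed value is
+// GFLOP/ms, i.e. TFlop/s.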
args) +{ + KernelTimer timer; + + printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n", + __func__, + grid_dim.x, + grid_dim.y, + grid_dim.z, + block_dim.x, + block_dim.y, + block_dim.z); + + printf("Warm up\n"); + + // warm up + hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...); + + printf("Start running %d times...\n", nrepeat); + + timer.Start(); + + for(int i = 0; i < nrepeat; ++i) + { + hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...); + } + + timer.End(); + + return timer.GetElapsedTime() / nrepeat; +} + +#endif diff --git a/host/host_tensor/include/device_tensor.hpp b/host/host_tensor/include/device_tensor.hpp new file mode 100644 index 0000000000..1a7a34a4cf --- /dev/null +++ b/host/host_tensor/include/device_tensor.hpp @@ -0,0 +1,9 @@ +#pragma once +#include "host_tensor.hpp" +#include "common_header.hpp" + +template +void ostream_tensor_descriptor(TensorDesc, std::ostream& os = std::cout) +{ + ostream_HostTensorDescriptor(make_HostTensorDescriptor(TensorDesc{}), os); +} diff --git a/host/host_tensor/include/host_conv.hpp b/host/host_tensor/include/host_conv.hpp new file mode 100644 index 0000000000..7f26cb42f7 --- /dev/null +++ b/host/host_tensor/include/host_conv.hpp @@ -0,0 +1,326 @@ +#pragma once +#include "host_tensor.hpp" + +template +void host_direct_convolution(const Tensor& in, + const Tensor& wei, + Tensor& out, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const ConvTensorLayout layout = ConvTensorLayout::NCHW) +{ + using namespace ck; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + auto f_nchw = [&](auto n, auto k, auto ho, auto wo) { + double v = 0; + for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c) + { + for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y) + { + int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0]; + for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x) + { + int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1]; + if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 && + wi < in.mDesc.GetLengths()[3]) + { + v += static_cast(in(n, c, hi, wi)) * + static_cast(wei(k, c, y, x)); + } + } + } + } + out(n, k, ho, wo) = v; + }; + + auto f_nhwc = [&](auto n, auto ho, auto wo, auto k) { + double v = 0; + for(int c = 0; c < wei.mDesc.GetLengths()[3]; ++c) + { + for(int y = 0; y < wei.mDesc.GetLengths()[1]; ++y) + { + int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0]; + for(int x = 0; x < wei.mDesc.GetLengths()[2]; ++x) + { + int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1]; + if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 && + wi < in.mDesc.GetLengths()[2]) + { + v += static_cast(in(n, hi, wi, c)) * + static_cast(wei(k, y, x, c)); + } + } + } + } + out(n, ho, wo, k) = v; + }; + + switch(layout) + { + case ConvTensorLayout::NCHW: + make_ParallelTensorFunctor(f_nchw, + out.mDesc.GetLengths()[0], + out.mDesc.GetLengths()[1], + out.mDesc.GetLengths()[2], + out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); + break; + case ConvTensorLayout::NHWC: + make_ParallelTensorFunctor(f_nhwc, + out.mDesc.GetLengths()[0], + out.mDesc.GetLengths()[1], + out.mDesc.GetLengths()[2], + out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); + break; + default: throw std::runtime_error("wrong! 
not supported layout"); + } +} + +template +void host_winograd_3x3_convolution(const Tensor& in_nchw, + const Tensor& wei_kcyx, + Tensor& out_nkhw, + InLeftPads, + InRightPads) +{ + using namespace ck; + + constexpr std::size_t HoPerTile = 2; + constexpr std::size_t WoPerTile = 2; + + std::size_t N = in_nchw.mDesc.GetLengths()[0]; + std::size_t C = in_nchw.mDesc.GetLengths()[1]; + std::size_t HI = in_nchw.mDesc.GetLengths()[2]; + std::size_t WI = in_nchw.mDesc.GetLengths()[3]; + + std::size_t K = wei_kcyx.mDesc.GetLengths()[0]; + std::size_t Y = wei_kcyx.mDesc.GetLengths()[2]; + std::size_t X = wei_kcyx.mDesc.GetLengths()[3]; + + std::size_t HO = out_nkhw.mDesc.GetLengths()[2]; + std::size_t WO = out_nkhw.mDesc.GetLengths()[3]; + + index_t h_pad_low = InLeftPads{}.Get(Number<0>{}); + index_t w_pad_low = InLeftPads{}.Get(Number<1>{}); + + std::size_t HiPerTile = HoPerTile + Y - 1; + std::size_t WiPerTile = WoPerTile + X - 1; + + std::size_t HTile = (HO + HoPerTile - 1) / HoPerTile; + std::size_t WTile = (WO + WoPerTile - 1) / WoPerTile; + + Tensor in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile}); + Tensor in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile}); + Tensor wei_transform({K, C, HiPerTile, WiPerTile}); + Tensor out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile}); + Tensor out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile}); + + auto f_in_hold = [&](auto n, auto c, auto htile, auto wtile) { + for(int j = 0; j < HiPerTile; ++j) + { + int hi = HoPerTile * htile + j - h_pad_low; + for(int i = 0; i < WiPerTile; ++i) + { + int wi = WoPerTile * wtile + i - w_pad_low; + + if(hi >= 0 && hi < in_nchw.mDesc.GetLengths()[2] && wi >= 0 && + wi < in_nchw.mDesc.GetLengths()[3]) + { + in_hold(n, c, htile, wtile, j, i) = in_nchw(n, c, hi, wi); + } + else + { + in_hold(n, c, htile, wtile, j, i) = TIn(0); + } + } + } + }; + + auto f_in_transform = [&](auto n, auto c, auto htile, auto wtile) { + in_transform(n, c, htile, wtile, 0, 0) = + in_hold(n, c, htile, wtile, 0, 0) - in_hold(n, c, htile, wtile, 0, 2) - + in_hold(n, c, htile, wtile, 2, 0) + in_hold(n, c, htile, wtile, 2, 2); + in_transform(n, c, htile, wtile, 0, 1) = + in_hold(n, c, htile, wtile, 0, 1) + in_hold(n, c, htile, wtile, 0, 2) - + in_hold(n, c, htile, wtile, 2, 1) - in_hold(n, c, htile, wtile, 2, 2); + in_transform(n, c, htile, wtile, 0, 2) = + -in_hold(n, c, htile, wtile, 0, 1) + in_hold(n, c, htile, wtile, 0, 2) + + in_hold(n, c, htile, wtile, 2, 1) - in_hold(n, c, htile, wtile, 2, 2); + in_transform(n, c, htile, wtile, 0, 3) = + in_hold(n, c, htile, wtile, 0, 1) - in_hold(n, c, htile, wtile, 0, 3) - + in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 3); + + in_transform(n, c, htile, wtile, 1, 0) = + in_hold(n, c, htile, wtile, 1, 0) - in_hold(n, c, htile, wtile, 1, 2) + + in_hold(n, c, htile, wtile, 2, 0) - in_hold(n, c, htile, wtile, 2, 2); + in_transform(n, c, htile, wtile, 1, 1) = + in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 2) + + in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 2); + in_transform(n, c, htile, wtile, 1, 2) = + -in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 2) - + in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 2); + in_transform(n, c, htile, wtile, 1, 3) = + in_hold(n, c, htile, wtile, 1, 1) - in_hold(n, c, htile, wtile, 1, 3) + + in_hold(n, c, htile, wtile, 2, 1) - in_hold(n, c, htile, wtile, 2, 3); + + in_transform(n, c, htile, wtile, 2, 0) = + -in_hold(n, c, htile, wtile, 1, 0) + in_hold(n, c, 
htile, wtile, 1, 2) + + in_hold(n, c, htile, wtile, 2, 0) - in_hold(n, c, htile, wtile, 2, 2); + in_transform(n, c, htile, wtile, 2, 1) = + -in_hold(n, c, htile, wtile, 1, 1) - in_hold(n, c, htile, wtile, 1, 2) + + in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 2); + in_transform(n, c, htile, wtile, 2, 2) = + in_hold(n, c, htile, wtile, 1, 1) - in_hold(n, c, htile, wtile, 1, 2) - + in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 2); + in_transform(n, c, htile, wtile, 2, 3) = + -in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 3) + + in_hold(n, c, htile, wtile, 2, 1) - in_hold(n, c, htile, wtile, 2, 3); + + in_transform(n, c, htile, wtile, 3, 0) = + in_hold(n, c, htile, wtile, 1, 0) - in_hold(n, c, htile, wtile, 1, 2) - + in_hold(n, c, htile, wtile, 3, 0) + in_hold(n, c, htile, wtile, 3, 2); + in_transform(n, c, htile, wtile, 3, 1) = + in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 2) - + in_hold(n, c, htile, wtile, 3, 1) - in_hold(n, c, htile, wtile, 3, 2); + in_transform(n, c, htile, wtile, 3, 2) = + -in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 2) + + in_hold(n, c, htile, wtile, 3, 1) - in_hold(n, c, htile, wtile, 3, 2); + in_transform(n, c, htile, wtile, 3, 3) = + in_hold(n, c, htile, wtile, 1, 1) - in_hold(n, c, htile, wtile, 1, 3) - + in_hold(n, c, htile, wtile, 3, 1) + in_hold(n, c, htile, wtile, 3, 3); + }; + + auto f_wei_transform = [&](auto k, auto c) { + wei_transform(k, c, 0, 0) = double(wei_kcyx(k, c, 0, 0)); + wei_transform(k, c, 0, 1) = 0.5 * double(wei_kcyx(k, c, 0, 0)) + + 0.5 * double(wei_kcyx(k, c, 0, 1)) + + 0.5 * double(wei_kcyx(k, c, 0, 2)); + wei_transform(k, c, 0, 2) = 0.5 * double(wei_kcyx(k, c, 0, 0)) - + 0.5 * double(wei_kcyx(k, c, 0, 1)) + + 0.5 * double(wei_kcyx(k, c, 0, 2)); + wei_transform(k, c, 0, 3) = double(wei_kcyx(k, c, 0, 2)); + + wei_transform(k, c, 1, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) + + 0.5 * double(wei_kcyx(k, c, 1, 0)) + + 0.5 * double(wei_kcyx(k, c, 2, 0)); + wei_transform(k, c, 1, 1) = + 0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) + + 0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) + + 0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) + + 0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) + + 0.25 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 1, 2) = + 0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) + + 0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) - + 0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) + + 0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) + + 0.25 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 1, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) + + 0.5 * double(wei_kcyx(k, c, 1, 2)) + + 0.5 * double(wei_kcyx(k, c, 2, 2)); + + wei_transform(k, c, 2, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) - + 0.5 * double(wei_kcyx(k, c, 1, 0)) + + 0.5 * double(wei_kcyx(k, c, 2, 0)); + wei_transform(k, c, 2, 1) = + 0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) + + 0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) - + 0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) + + 0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) + + 0.25 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 2, 2) = + 0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * 
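f_wei_transform is the matching 3x3 filter transform G * w * G^T written out by hand; the 0.5 and 0.25 coefficients are products of the rows of G:

    // Standard Winograd F(2x2,3x3) filter-transform matrix; wei_transform = G * w * Gt,
    // with w the 3x3 filter and Gt the transpose of G.
    constexpr double G[4][3] = {
        {1.0, 0.0, 0.0},
        {0.5, 0.5, 0.5},
        {0.5, -0.5, 0.5},
        {0.0, 0.0, 1.0},
    };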
double(wei_kcyx(k, c, 0, 1)) + + 0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) + + 0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) + + 0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) + + 0.25 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 2, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) - + 0.5 * double(wei_kcyx(k, c, 1, 2)) + + 0.5 * double(wei_kcyx(k, c, 2, 2)); + + wei_transform(k, c, 3, 0) = double(wei_kcyx(k, c, 2, 0)); + wei_transform(k, c, 3, 1) = 0.5 * double(wei_kcyx(k, c, 2, 0)) + + 0.5 * double(wei_kcyx(k, c, 2, 1)) + + 0.5 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 3, 2) = 0.5 * double(wei_kcyx(k, c, 2, 0)) - + 0.5 * double(wei_kcyx(k, c, 2, 1)) + + 0.5 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 3, 3) = double(wei_kcyx(k, c, 2, 2)); + }; + + auto f_out_transform = [&](auto n, auto k, auto htile, auto wtile) { + for(int j = 0; j < HiPerTile; ++j) + { + for(int i = 0; i < WiPerTile; ++i) + { + double v = 0; + for(int c = 0; c < C; ++c) + { + v += in_transform(n, c, htile, wtile, j, i) * wei_transform(k, c, j, i); + } + + out_transform(n, k, htile, wtile, j, i) = v; + } + } + }; + + auto f_out_hold = [&](auto n, auto k, auto htile, auto wtile) { + out_hold(n, k, htile, wtile, 0, 0) = + out_transform(n, k, htile, wtile, 0, 0) + out_transform(n, k, htile, wtile, 0, 1) + + out_transform(n, k, htile, wtile, 0, 2) + out_transform(n, k, htile, wtile, 1, 0) + + out_transform(n, k, htile, wtile, 1, 1) + out_transform(n, k, htile, wtile, 1, 2) + + out_transform(n, k, htile, wtile, 2, 0) + out_transform(n, k, htile, wtile, 2, 1) + + out_transform(n, k, htile, wtile, 2, 2); + out_hold(n, k, htile, wtile, 0, 1) = + out_transform(n, k, htile, wtile, 0, 1) - out_transform(n, k, htile, wtile, 0, 2) - + out_transform(n, k, htile, wtile, 0, 3) + out_transform(n, k, htile, wtile, 1, 1) - + out_transform(n, k, htile, wtile, 1, 2) - out_transform(n, k, htile, wtile, 1, 3) + + out_transform(n, k, htile, wtile, 2, 1) - out_transform(n, k, htile, wtile, 2, 2) - + out_transform(n, k, htile, wtile, 2, 3); + out_hold(n, k, htile, wtile, 1, 0) = + out_transform(n, k, htile, wtile, 1, 0) + out_transform(n, k, htile, wtile, 1, 1) + + out_transform(n, k, htile, wtile, 1, 2) - out_transform(n, k, htile, wtile, 2, 0) - + out_transform(n, k, htile, wtile, 2, 1) - out_transform(n, k, htile, wtile, 2, 2) - + out_transform(n, k, htile, wtile, 3, 0) - out_transform(n, k, htile, wtile, 3, 1) - + out_transform(n, k, htile, wtile, 3, 2); + out_hold(n, k, htile, wtile, 1, 1) = + out_transform(n, k, htile, wtile, 1, 1) - out_transform(n, k, htile, wtile, 1, 2) - + out_transform(n, k, htile, wtile, 1, 3) - out_transform(n, k, htile, wtile, 2, 1) + + out_transform(n, k, htile, wtile, 2, 2) + out_transform(n, k, htile, wtile, 2, 3) - + out_transform(n, k, htile, wtile, 3, 1) + out_transform(n, k, htile, wtile, 3, 2) + + out_transform(n, k, htile, wtile, 3, 3); + }; + + auto f_out = [&](auto n, auto k, auto htile, auto wtile) { + for(int j = 0; j < HoPerTile; ++j) + { + std::size_t ho = HoPerTile * htile + j; + for(int i = 0; i < WoPerTile; ++i) + { + std::size_t wo = WoPerTile * wtile + i; + out_nkhw(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i); + } + } + }; + + std::size_t num_thread = std::thread::hardware_concurrency(); + + make_ParallelTensorFunctor(f_in_hold, N, C, HTile, WTile)(num_thread); + make_ParallelTensorFunctor(f_in_transform, N, C, HTile, WTile)(num_thread); + make_ParallelTensorFunctor(f_wei_transform, 
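f_out_transform reduces the element-wise product of the two transformed tiles over the input channels, and f_out_hold applies the inverse transform A^T * m * A to recover each 2x2 output tile, so per tile the routine computes out = A^T * [ (G*w*G^T) .* (B^T*d*B), summed over C ] * A. The output-transform matrix implied by those sums:

    // Standard Winograd F(2x2,3x3) output-transform matrix; out_hold = At * m * A,
    // with m the channel-reduced 4x4 product tile and A the transpose of At.
    constexpr int At[2][4] = {
        {1, 1, 1, 0},
        {0, 1, -1, -1},
    };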
K, C)(num_thread); + make_ParallelTensorFunctor(f_out_transform, N, K, HTile, WTile)(num_thread); + make_ParallelTensorFunctor(f_out_hold, N, K, HTile, WTile)(num_thread); + make_ParallelTensorFunctor(f_out, N, K, HTile, WTile)(num_thread); +} diff --git a/host/host_tensor/include/host_conv_bwd_data.hpp b/host/host_tensor/include/host_conv_bwd_data.hpp new file mode 100644 index 0000000000..07617c3926 --- /dev/null +++ b/host/host_tensor/include/host_conv_bwd_data.hpp @@ -0,0 +1,143 @@ +#pragma once +#include "host_tensor.hpp" + +template +void host_direct_convolution_backward_data(Tensor& in, + const Tensor& wei, + const Tensor& out, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const ConvTensorLayout layout = ConvTensorLayout::NCHW) +{ + using namespace ck; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + auto f_nchw = [&](auto n, auto c, auto hi, auto wi) { + std::size_t N = in.mDesc.GetLengths()[I0]; + std::size_t C = in.mDesc.GetLengths()[I1]; + std::size_t Hi = in.mDesc.GetLengths()[I2]; + std::size_t Wi = in.mDesc.GetLengths()[I3]; + + std::size_t K = wei.mDesc.GetLengths()[I0]; + std::size_t Y = wei.mDesc.GetLengths()[I2]; + std::size_t X = wei.mDesc.GetLengths()[I3]; + + std::size_t Ho = out.mDesc.GetLengths()[I2]; + std::size_t Wo = out.mDesc.GetLengths()[I3]; + + double v = 0; + + for(int y = 0; y < Y; ++y) + { + int h_tmp = hi + in_left_pads[I0] - y * conv_dilations[I0]; + + if(h_tmp % conv_strides[I0] == 0) + { + int ho = h_tmp / conv_strides[I0]; + + if(ho >= 0 && ho < Ho) + { + for(int x = 0; x < X; ++x) + { + int w_tmp = wi + in_left_pads[I1] - x * conv_dilations[I1]; + + if(w_tmp % conv_strides[I1] == 0) + { + int wo = w_tmp / conv_strides[I1]; + + if(wo >= 0 && wo < Wo) + { + for(int k = 0; k < K; ++k) + { + v += out(n, k, ho, wo) * wei(k, c, y, x); + } + } + } + } + } + } + } + + in(n, c, hi, wi) = v; + }; + + auto f_nhwc = [&](auto n, auto hi, auto wi, auto c) { + std::size_t N = in.mDesc.GetLengths()[I0]; + std::size_t Hi = in.mDesc.GetLengths()[I1]; + std::size_t Wi = in.mDesc.GetLengths()[I2]; + std::size_t C = in.mDesc.GetLengths()[I3]; + + std::size_t K = wei.mDesc.GetLengths()[I0]; + std::size_t Y = wei.mDesc.GetLengths()[I1]; + std::size_t X = wei.mDesc.GetLengths()[I2]; + + std::size_t Ho = out.mDesc.GetLengths()[I1]; + std::size_t Wo = out.mDesc.GetLengths()[I2]; + + double v = 0; + + for(int y = 0; y < Y; ++y) + { + int h_tmp = hi + in_left_pads[I0] - y * conv_dilations[I0]; + + if(h_tmp % conv_strides[I0] == 0) + { + int ho = h_tmp / conv_strides[I0]; + + if(ho >= 0 && ho < Ho) + { + for(int x = 0; x < X; ++x) + { + int w_tmp = wi + in_left_pads[I1] - x * conv_dilations[I1]; + + if(w_tmp % conv_strides[I1] == 0) + { + int wo = w_tmp / conv_strides[I1]; + + if(wo >= 0 && wo < Wo) + { + for(int k = 0; k < K; ++k) + { + v += out(n, ho, wo, k) * wei(k, y, x, c); + } + } + } + } + } + } + } + + in(n, hi, wi, c) = v; + }; + + switch(layout) + { + case ConvTensorLayout::NCHW: + make_ParallelTensorFunctor(f_nchw, + in.mDesc.GetLengths()[0], + in.mDesc.GetLengths()[1], + in.mDesc.GetLengths()[2], + in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); + break; + case ConvTensorLayout::NHWC: + make_ParallelTensorFunctor(f_nhwc, + in.mDesc.GetLengths()[0], + in.mDesc.GetLengths()[1], + in.mDesc.GetLengths()[2], + 
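host_direct_convolution_backward_data inverts the forward index mapping: for every input pixel it looks for output rows ho with hi = ho*stride + y*dilation - pad, i.e. ho = (hi + pad - y*dilation) / stride, and accepts a tap only when that division is exact and ho is in range. A compact sketch of the test used by f_nchw / f_nhwc (the helper name is illustrative, not part of the patch):

    // Hypothetical helper equivalent to the h_tmp / ho logic in the lambdas above.
    inline bool input_to_output_row(
        int hi, int y, int stride_h, int dilation_h, int left_pad_h, int Ho, int& ho)
    {
        const int h_tmp = hi + left_pad_h - y * dilation_h;
        if(h_tmp < 0 || h_tmp % stride_h != 0)
            return false;
        ho = h_tmp / stride_h;
        return ho < Ho;
    }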
in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); + break; + default: throw std::runtime_error("wrong! not supported layout"); + } +} diff --git a/host/host_tensor/include/host_tensor.hpp b/host/host_tensor/include/host_tensor.hpp new file mode 100644 index 0000000000..70778a4a94 --- /dev/null +++ b/host/host_tensor/include/host_tensor.hpp @@ -0,0 +1,322 @@ +#ifndef HOST_TENSOR_HPP +#define HOST_TENSOR_HPP + +#include +#include +#include +#include +#include +#include +#include + +template +std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim) +{ + bool first = true; + for(auto&& v : range) + { + if(first) + first = false; + else + os << delim; + os << v; + } + return os; +} + +template +std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim) +{ + bool first = true; + for(auto&& v : range) + { + if(first) + first = false; + else + os << delim; + os << T{v}; + } + return os; +} + +typedef enum +{ + Half = 0, + Float = 1, +} DataType_t; + +template +struct DataType; + +template <> +struct DataType : std::integral_constant +{ +}; + +template +auto call_f_unpack_args_impl(F f, T args, std::index_sequence) +{ + return f(std::get(args)...); +} + +template +auto call_f_unpack_args(F f, T args) +{ + constexpr std::size_t N = std::tuple_size{}; + + return call_f_unpack_args_impl(f, args, std::make_index_sequence{}); +} + +template +auto construct_f_unpack_args_impl(T args, std::index_sequence) +{ + return F(std::get(args)...); +} + +template +auto construct_f_unpack_args(F, T args) +{ + constexpr std::size_t N = std::tuple_size{}; + + return construct_f_unpack_args_impl(args, std::make_index_sequence{}); +} + +struct HostTensorDescriptor +{ + HostTensorDescriptor() = delete; + + template + HostTensorDescriptor(std::vector lens); + + template + HostTensorDescriptor(std::vector lens, std::vector strides); + + void CalculateStrides(); + + template + HostTensorDescriptor(const Range& lens) : mLens(lens.begin(), lens.end()) + { + this->CalculateStrides(); + } + + template + HostTensorDescriptor(const Range1& lens, const Range2& strides) + : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end()) + { + } + + std::size_t GetNumOfDimension() const; + std::size_t GetElementSize() const; + std::size_t GetElementSpace() const; + + const std::vector& GetLengths() const; + const std::vector& GetStrides() const; + + template + std::size_t GetOffsetFromMultiIndex(Is... is) const + { + assert(sizeof...(Is) == this->GetNumOfDimension()); + std::initializer_list iss{static_cast(is)...}; + return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0}); + } + + private: + std::vector mLens; + std::vector mStrides; +}; + +struct joinable_thread : std::thread +{ + template + joinable_thread(Xs&&... xs) : std::thread(std::forward(xs)...) + { + } + + joinable_thread(joinable_thread&&) = default; + joinable_thread& operator=(joinable_thread&&) = default; + + ~joinable_thread() + { + if(this->joinable()) + this->join(); + } +}; + +template +struct ParallelTensorFunctor +{ + F mF; + static constexpr std::size_t NDIM = sizeof...(Xs); + std::array mLens; + std::array mStrides; + std::size_t mN1d; + + ParallelTensorFunctor(F f, Xs... 
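HostTensorDescriptor keeps lengths and strides; when strides are not supplied, CalculateStrides (defined in host_tensor.cpp further down) builds packed row-major strides, and GetOffsetFromMultiIndex is a dot product of the multi-index with the strides. A small worked example, assuming the packed constructor above:

    #include <cassert>
    #include <vector>

    void descriptor_offset_example()
    {
        // Packed lengths {2, 3, 4, 5} give strides {60, 20, 5, 1}.
        HostTensorDescriptor desc(std::vector<std::size_t>{2, 3, 4, 5});
        assert(desc.GetStrides() == (std::vector<std::size_t>{60, 20, 5, 1}));
        // offset(1, 2, 3, 4) = 1*60 + 2*20 + 3*5 + 4*1 = 119
        assert(desc.GetOffsetFromMultiIndex(1, 2, 3, 4) == 119);
    }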
xs) : mF(f), mLens({static_cast(xs)...}) + { + mStrides.back() = 1; + std::partial_sum(mLens.rbegin(), + mLens.rend() - 1, + mStrides.rbegin() + 1, + std::multiplies()); + mN1d = mStrides[0] * mLens[0]; + } + + std::array GetNdIndices(std::size_t i) const + { + std::array indices; + + for(int idim = 0; idim < NDIM; ++idim) + { + indices[idim] = i / mStrides[idim]; + i -= indices[idim] * mStrides[idim]; + } + + return indices; + } + + void operator()(std::size_t num_thread = std::thread::hardware_concurrency()) const + { + std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread; + + std::vector threads(num_thread); + + for(std::size_t it = 0; it < num_thread; ++it) + { + std::size_t iw_begin = it * work_per_thread; + std::size_t iw_end = std::min((it + 1) * work_per_thread, mN1d); + + auto f = [=] { + for(std::size_t iw = iw_begin; iw < iw_end; ++iw) + { + call_f_unpack_args(mF, GetNdIndices(iw)); + } + }; + threads[it] = joinable_thread(f); + } + } +}; + +template +auto make_ParallelTensorFunctor(F f, Xs... xs) +{ + return ParallelTensorFunctor(f, xs...); +} + +template +struct Tensor +{ + template + Tensor(std::initializer_list lens) : mDesc(lens), mData(mDesc.GetElementSpace()) + { + } + + template + Tensor(std::vector lens) : mDesc(lens), mData(mDesc.GetElementSpace()) + { + } + + template + Tensor(std::vector lens, std::vector strides) + : mDesc(lens, strides), mData(mDesc.GetElementSpace()) + { + } + + Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {} + + template + void GenerateTensorValue(G g, std::size_t num_thread = 1) + { + switch(mDesc.GetNumOfDimension()) + { + case 1: { + auto f = [&](auto i) { (*this)(i) = g(i); }; + make_ParallelTensorFunctor(f, mDesc.GetLengths()[0])(num_thread); + break; + } + case 2: { + auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); }; + make_ParallelTensorFunctor(f, mDesc.GetLengths()[0], mDesc.GetLengths()[1])(num_thread); + break; + } + case 3: { + auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); }; + make_ParallelTensorFunctor( + f, mDesc.GetLengths()[0], mDesc.GetLengths()[1], mDesc.GetLengths()[2])(num_thread); + break; + } + case 4: { + auto f = [&](auto i0, auto i1, auto i2, auto i3) { + (*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3); + }; + make_ParallelTensorFunctor(f, + mDesc.GetLengths()[0], + mDesc.GetLengths()[1], + mDesc.GetLengths()[2], + mDesc.GetLengths()[3])(num_thread); + break; + } + default: throw std::runtime_error("unspported dimension"); + } + } + + template + T& operator()(Is... is) + { + return mData[mDesc.GetOffsetFromMultiIndex(is...)]; + } + + template + const T& operator()(Is... 
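ParallelTensorFunctor flattens the N-d index space into mN1d points, hands each thread a contiguous chunk, and unflattens every flat index with GetNdIndices before calling the functor; Tensor::GenerateTensorValue and the host reference convolutions are built on it. A minimal usage sketch (assuming Tensor is a class template over the element type, as in this header):

    #include <thread>
    #include <vector>

    void parallel_fill_example()
    {
        // Fill a 4 x 8 tensor in parallel with the helpers above (illustrative only).
        Tensor<float> t(std::vector<std::size_t>{4, 8});
        auto fill = [&](auto i0, auto i1) { t(i0, i1) = static_cast<float>(i0 * 8 + i1); };
        make_ParallelTensorFunctor(fill, 4, 8)(std::thread::hardware_concurrency());
    }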
is) const + { + return mData[mDesc.GetOffsetFromMultiIndex(is...)]; + } + + typename std::vector::iterator begin() { return mData.begin(); } + + typename std::vector::iterator end() { return mData.end(); } + + typename std::vector::const_iterator begin() const { return mData.begin(); } + + typename std::vector::const_iterator end() const { return mData.end(); } + + HostTensorDescriptor mDesc; + std::vector mData; +}; + +template +HostTensorDescriptor::HostTensorDescriptor(std::vector lens) : mLens(lens) +{ + this->CalculateStrides(); +} + +template +HostTensorDescriptor::HostTensorDescriptor(std::vector lens, std::vector strides) + : mLens(lens), mStrides(strides) +{ +} + +void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout); + +template +void check_error(const Tensor& ref, const Tensor& result) +{ + float error = 0; + float max_diff = -1; + float ref_value = 0, result_value = 0; + for(int i = 0; i < ref.mData.size(); ++i) + { + error += std::abs(double(ref.mData[i]) - double(result.mData[i])); + float diff = std::abs(double(ref.mData[i]) - double(result.mData[i])); + if(max_diff < diff) + { + max_diff = diff; + ref_value = ref.mData[i]; + result_value = result.mData[i]; + } + } + + std::cout << "error: " << error << std::endl; + std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl; +} + +#endif diff --git a/host/host_tensor/include/host_tensor_generator.hpp b/host/host_tensor/include/host_tensor_generator.hpp new file mode 100644 index 0000000000..98192e066f --- /dev/null +++ b/host/host_tensor/include/host_tensor_generator.hpp @@ -0,0 +1,60 @@ +#ifndef HOST_TENSOR_GENERATOR_HPP +#define HOST_TENSOR_GENERATOR_HPP + +#include +#include "config.hpp" + +struct GeneratorTensor_1 +{ + int value = 1; + + template + float operator()(Is... is) + { + return value; + } +}; + +struct GeneratorTensor_2 +{ + int min_value = 0; + int max_value = 1; + + template + float operator()(Is...) + { + return (std::rand() % (max_value - min_value)) + min_value; + } +}; + +template +struct GeneratorTensor_3 +{ + T min_value = 0; + T max_value = 1; + + template + float operator()(Is...) + { + float tmp = float(std::rand()) / float(RAND_MAX); + + return min_value + tmp * (max_value - min_value); + } +}; + +struct GeneratorTensor_Checkboard +{ + template + float operator()(Ts... Xs) const + { + std::array dims = {{static_cast(Xs)...}}; + return std::accumulate(dims.begin(), + dims.end(), + true, + [](bool init, ck::index_t x) -> int { return init != (x % 2); }) + ? 
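check_error prints the accumulated absolute error plus the single worst element, and is normally paired with the random generators below to validate a device kernel against the host reference. A sketch of the usual flow (the driver wiring itself is not part of this hunk):

    #include <thread>

    void generate_and_compare_example(Tensor<float>& in,
                                      Tensor<float>& wei,
                                      Tensor<float>& out_host,
                                      Tensor<float>& out_device)
    {
        const std::size_t num_thread = std::thread::hardware_concurrency();
        in.GenerateTensorValue(GeneratorTensor_3<float>{-0.5f, 0.5f}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5f, 0.5f}, num_thread);
        // ... run the host reference into out_host and the GPU kernel into out_device ...
        check_error(out_host, out_device); // total and max absolute difference
    }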
1 + : -1; + } +}; + +#endif diff --git a/host/host_tensor/src/device.cpp b/host/host_tensor/src/device.cpp new file mode 100644 index 0000000000..d0d74a4c2a --- /dev/null +++ b/host/host_tensor/src/device.cpp @@ -0,0 +1,67 @@ +#include "device.hpp" + +DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size) +{ + hipGetErrorString(hipMalloc(static_cast(&mpDeviceBuf), mMemSize)); +} + +void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; } + +void DeviceMem::ToDevice(const void* p) +{ + hipGetErrorString( + hipMemcpy(mpDeviceBuf, const_cast(p), mMemSize, hipMemcpyHostToDevice)); +} + +void DeviceMem::FromDevice(void* p) +{ + hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost)); +} + +DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); } + +struct KernelTimerImpl +{ + KernelTimerImpl() + { + hipEventCreate(&mStart); + hipEventCreate(&mEnd); + } + + ~KernelTimerImpl() + { + hipEventDestroy(mStart); + hipEventDestroy(mEnd); + } + + void Start() + { + hipDeviceSynchronize(); + hipEventRecord(mStart, 0); + } + + void End() + { + hipEventRecord(mEnd, 0); + hipEventSynchronize(mEnd); + } + + float GetElapsedTime() const + { + float time; + hipEventElapsedTime(&time, mStart, mEnd); + return time; + } + + hipEvent_t mStart, mEnd; +}; + +KernelTimer::KernelTimer() : impl(new KernelTimerImpl()) {} + +KernelTimer::~KernelTimer() {} + +void KernelTimer::Start() { impl->Start(); } + +void KernelTimer::End() { impl->End(); } + +float KernelTimer::GetElapsedTime() const { return impl->GetElapsedTime(); } diff --git a/host/host_tensor/src/host_tensor.cpp b/host/host_tensor/src/host_tensor.cpp new file mode 100644 index 0000000000..e840baf7f5 --- /dev/null +++ b/host/host_tensor/src/host_tensor.cpp @@ -0,0 +1,48 @@ +#include +#include + +#include "host_tensor.hpp" + +void HostTensorDescriptor::CalculateStrides() +{ + mStrides.clear(); + mStrides.resize(mLens.size(), 0); + if(mStrides.empty()) + return; + + mStrides.back() = 1; + std::partial_sum( + mLens.rbegin(), mLens.rend() - 1, mStrides.rbegin() + 1, std::multiplies()); +} + +std::size_t HostTensorDescriptor::GetNumOfDimension() const { return mLens.size(); } + +std::size_t HostTensorDescriptor::GetElementSize() const +{ + assert(mLens.size() == mStrides.size()); + return std::accumulate( + mLens.begin(), mLens.end(), std::size_t{1}, std::multiplies()); +} + +std::size_t HostTensorDescriptor::GetElementSpace() const +{ + auto ls = mLens | boost::adaptors::transformed([](std::size_t v) { return v - 1; }); + return std::inner_product(ls.begin(), ls.end(), mStrides.begin(), std::size_t{0}) + 1; +} + +const std::vector& HostTensorDescriptor::GetLengths() const { return mLens; } + +const std::vector& HostTensorDescriptor::GetStrides() const { return mStrides; } + +void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os) +{ + os << "dim " << desc.GetNumOfDimension() << ", "; + + os << "lengths {"; + LogRange(os, desc.GetLengths(), ", "); + os << "}, "; + + os << "strides {"; + LogRange(os, desc.GetStrides(), ", "); + os << "}" << std::endl; +} diff --git a/host/online_compilation/CMakeLists.txt b/host/online_compilation/CMakeLists.txt new file mode 100644 index 0000000000..02f6795308 --- /dev/null +++ b/host/online_compilation/CMakeLists.txt @@ -0,0 +1,168 @@ +set(CMAKE_CXX_COMPILER /opt/rocm/llvm/bin/clang++) + +## for online-compiling of HIP kernels +set(OLC_HIP_COMPILER ${CMAKE_CXX_COMPILER} CACHE PATH "") + +## reset to avoid the C++ options from the parent project 
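Two small notes on the utilities in this hunk: GetElementSpace returns 1 + sum_i (len_i - 1) * stride_i, the minimum number of elements a buffer must hold to cover the descriptor (equal to GetElementSize for packed strides, larger for padded ones), and DeviceMem / KernelTimer wrap the plain HIP allocation, copy and event APIs. An illustrative round trip (error handling omitted, as in the sources above):

    #include <vector>

    void device_roundtrip_example(const Tensor<float>& host)
    {
        // Packed lens {2,3,4,5}, strides {60,20,5,1}:
        //   element space = 1 + 1*60 + 2*20 + 3*5 + 4*1 = 120 = 2*3*4*5 elements.
        DeviceMem buf(sizeof(float) * host.mDesc.GetElementSpace());
        buf.ToDevice(host.mData.data());
        // ... launch a kernel that reads/writes buf.GetDeviceBuffer() ...
        std::vector<float> result(host.mDesc.GetElementSpace());
        buf.FromDevice(result.data());
    }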
+set(CMAKE_CXX_FLAGS "") +message("Compiling options for library and kernels: ${CMAKE_CXX_FLAGS}") + +# look for and register clang-offload-bundler +if(OLC_HIP_COMPILER MATCHES ".*clang\\+\\+$") + find_program(OLC_OFFLOADBUNDLER_BIN clang-offload-bundler + PATH_SUFFIXES bin + PATHS + /opt/rocm/llvm + ${CMAKE_INSTALL_PREFIX}/llvm + ) +endif() + +if(OLC_OFFLOADBUNDLER_BIN) + message(STATUS "clang-offload-bundler found: ${OLC_OFFLOADBUNDLER_BIN}") + set(OLC_OFFLOADBUNDLER_BIN "${OLC_OFFLOADBUNDLER_BIN}") +else() + # look for and register extractkernel + message(STATUS "clang-offload-bundler not found") + + find_program(EXTRACTKERNEL_BIN extractkernel + PATH_SUFFIXES bin + PATHS + /opt/rocm/hip + /opt/rocm/hcc + /opt/rocm + ${CMAKE_INSTALL_PREFIX}/hip + ${CMAKE_INSTALL_PREFIX}/hcc + ${CMAKE_INSTALL_PREFIX} + + ) + if(EXTRACTKERNEL_BIN) + message(STATUS "extractkernel found: ${EXTRACTKERNEL_BIN}") + set(EXTRACTKERNEL_BIN "${EXTRACTKERNEL_BIN}") + else() + message(FATAL_ERROR "extractkernel not found") + endif() +endif() + +option(Boost_USE_STATIC_LIBS "Use boost static libraries" OFF) +set(BOOST_COMPONENTS filesystem) +add_definitions(-DBOOST_ALL_NO_LIB=1) +find_package(Boost REQUIRED COMPONENTS ${BOOST_COMPONENTS}) + +# HIP is always required +find_package(hip REQUIRED PATHS /opt/rocm) +message(STATUS "Build with HIP ${hip_VERSION}") +target_flags(HIP_COMPILER_FLAGS hip::device) +# Remove cuda arch flags +string(REGEX REPLACE --cuda-gpu-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}") +string(REGEX REPLACE --offload-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}") + +set(OLC_hip_VERSION_MAJOR "${hip_VERSION_MAJOR}") +set(OLC_hip_VERSION_MINOR "${hip_VERSION_MINOR}") +set(OLC_hip_VERSION_PATCH "${hip_VERSION_PATCH}") + +option(ENABLE_DEBUG "Build to enable debugging" ON) +if(ENABLE_DEBUG) + set(OLC_DEBUG 1) +else() + set(OLC_DEBUG 0) +endif() + +configure_file("${PROJECT_SOURCE_DIR}/host/online_compilation/include/config.h.in" "${PROJECT_BINARY_DIR}/host/online_compilation/include/config.h") + +include_directories(BEFORE + ${PROJECT_BINARY_DIR}/host/online_compilation/include +) + +message(STATUS "Hip compiler flags: ${HIP_COMPILER_FLAGS}") + +## HIP_COMPILER_FLAGS will be used for on-line compiling of the HIP kernels +set(HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS} ${HIP_ONLINE_COMPILER_FLAGS}") +add_definitions("-DHIP_COMPILER_FLAGS=${HIP_COMPILER_FLAGS}") + +file(GLOB_RECURSE COMPOSABLE_KERNEL_INCLUDE_1 "${PROJECT_SOURCE_DIR}/composable_kernel/include/*/*.hpp") +file(GLOB COMPOSABLE_KERNEL_INCLUDE_2 "${PROJECT_SOURCE_DIR}/external/rocm/include/bfloat16_dev.hpp") +set(MCONV_KERNEL_INCLUDES + ${COMPOSABLE_KERNEL_INCLUDE_1} + ${COMPOSABLE_KERNEL_INCLUDE_2} + ) + +file(GLOB_RECURSE MCONV_KERNELS "${PROJECT_SOURCE_DIR}/composable_kernel/src/kernel_wrapper/*.cpp") + +add_kernels(${CMAKE_CURRENT_SOURCE_DIR} "${MCONV_KERNELS}") +add_kernel_includes(${CMAKE_CURRENT_SOURCE_DIR} "${MCONV_KERNEL_INCLUDES}") + +set(ONLINE_COMPILATION_SOURCE + ${PROJECT_BINARY_DIR}/kernel.cpp + ${PROJECT_BINARY_DIR}/kernel_includes.cpp +) + +include_directories(BEFORE + ${PROJECT_BINARY_DIR}/host/online_compilation/include + include +) + +set(OLC_HIP_UTILITY_CPPS + hip_utility/logger.cpp + hip_utility/tmp_dir.cpp + hip_utility/md5.cpp + hip_utility/exec_utils.cpp + hip_utility/target_properties.cpp + hip_utility/handlehip.cpp + hip_utility/kernel_build_params.cpp + hip_utility/hip_build_utils.cpp + hip_utility/hipoc_program.cpp + hip_utility/hipoc_kernel.cpp + hip_utility/kernel_cache.cpp + 
hip_utility/binary_cache.cpp + ) + +list(APPEND OLC_SOURCES ${OLC_HIP_UTILITY_CPPS} ${OLC_HIP_UTILITY_HEADERS}) + +## addkernels provide the tool to create inlined kernels in one header +add_subdirectory(addkernels) + +function(inline_kernels_src KERNELS KERNEL_INCLUDES) + set(KERNEL_SRC_HPP_FILENAME batch_all.cpp.hpp) + set(KERNEL_SRC_HPP_PATH ${PROJECT_BINARY_DIR}/inlined_kernels/${KERNEL_SRC_HPP_FILENAME}) + set(KERNEL_SRC_CPP_PATH ${PROJECT_BINARY_DIR}/inlined_kernels/batch_all.cpp) + + add_custom_command( + OUTPUT ${KERNEL_SRC_HPP_PATH} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDS addkernels ${KERNELS} ${KERNEL_INCLUDES} + COMMAND $ -target ${KERNEL_SRC_HPP_PATH} -extern -source ${KERNELS} + COMMENT "Inlining All kernels" + ) + configure_file(kernels_batch.cpp.in ${KERNEL_SRC_CPP_PATH}) + list(APPEND OLC_SOURCES ${KERNEL_SRC_CPP_PATH} ${KERNEL_SRC_HPP_PATH}) + + set(OLC_SOURCES ${OLC_SOURCES} PARENT_SCOPE) +endfunction() + +inline_kernels_src("${MCONV_KERNELS}" "${MCONV_KERNEL_INCLUDES}") + +list(APPEND ONLINE_COMPILATION_SOURCE ${OLC_SOURCES} ${PROJECT_BINARY_DIR}/olc_kernel_includes.h) + +add_custom_command( + OUTPUT ${PROJECT_BINARY_DIR}/olc_kernel_includes.h + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDS addkernels ${MCONV_KERNEL_INCLUDES} + COMMAND $ -no-recurse -guard GUARD_OLC_KERNEL_INCLUDES_HPP_ -target ${PROJECT_BINARY_DIR}/olc_kernel_includes.h -source ${MCONV_KERNEL_INCLUDES} + COMMENT "Inlining HIP kernel includes" + ) + +## the library target +add_library(online_compilation SHARED ${ONLINE_COMPILATION_SOURCE}) + +target_include_directories(online_compilation PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/online_compilation/include/) +target_include_directories(online_compilation PRIVATE ${PROJECT_BINARY_DIR}) +target_include_directories(online_compilation PRIVATE ${PROJECT_SOURCE_DIR}/external/half/include/) + +target_link_libraries(online_compilation PRIVATE hip::device) +target_link_libraries(online_compilation INTERFACE hip::host) +target_link_libraries(online_compilation PRIVATE Boost::filesystem) + +target_compile_features(online_compilation PUBLIC) +set_target_properties(online_compilation PROPERTIES POSITION_INDEPENDENT_CODE ON) + +install(TARGETS online_compilation LIBRARY DESTINATION lib) diff --git a/host/online_compilation/addkernels/CMakeLists.txt b/host/online_compilation/addkernels/CMakeLists.txt new file mode 100644 index 0000000000..874cba6a5e --- /dev/null +++ b/host/online_compilation/addkernels/CMakeLists.txt @@ -0,0 +1,30 @@ +################################################################################ +# +# MIT License +# +# Copyright (c) 2017 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +################################################################################ + +set(ADD_KERNELS_SOURCE include_inliner.cpp addkernels.cpp) + +add_executable(addkernels EXCLUDE_FROM_ALL ${ADD_KERNELS_SOURCE}) + diff --git a/host/online_compilation/addkernels/addkernels.cpp b/host/online_compilation/addkernels/addkernels.cpp new file mode 100644 index 0000000000..5be523d97b --- /dev/null +++ b/host/online_compilation/addkernels/addkernels.cpp @@ -0,0 +1,264 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "include_inliner.hpp" +#include +#include +#include +#include +#include +#include +#include + +void Bin2Hex(std::istream& source, + std::ostream& target, + const std::string& variable, + bool nullTerminate, + size_t bufferSize, + size_t lineSize) +{ + source.seekg(0, std::ios::end); + std::unique_ptr buffer(new unsigned char[bufferSize]); + std::streamoff sourceSize = source.tellg(); + std::streamoff blockStart = 0; + + if(variable.length() != 0) + { + target << "extern const size_t " << variable << "_SIZE;" << std::endl; + target << "extern const unsigned char " << variable << "[];" << std::endl; + target << "const size_t " << variable << "_SIZE = " << std::setbase(10) << sourceSize << ";" + << std::endl; + target << "const unsigned char " << variable << "[] = {" << std::endl; + } + + target << std::setbase(16) << std::setfill('0'); + source.seekg(0, std::ios::beg); + + while(blockStart < sourceSize) + { + source.read(reinterpret_cast(buffer.get()), bufferSize); + + std::streamoff pos = source.tellg(); + std::streamoff blockSize = (pos < 0 ? 
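Bin2Hex converts a kernel source file into a C array plus a _SIZE constant so the online-compilation library can embed kernel sources in the binary; for a variable named FOO the emitted text looks roughly like the following (illustrative, the exact byte values and line width come from the loop below):

    // extern const size_t FOO_SIZE;
    // extern const unsigned char FOO[];
    // const size_t FOO_SIZE = 1234;
    // const unsigned char FOO[] = {
    // 0x23,0x70,0x72,0x61, ...
    // 0x00,   // extra byte emitted only when null termination is requested
    // };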
sourceSize : pos) - blockStart; + std::streamoff i = 0; + + while(i < blockSize) + { + size_t j = i; + size_t end = std::min(i + lineSize, blockSize); + + for(; j < end; j++) + target << "0x" << std::setw(2) << static_cast(buffer[j]) << ","; + + target << std::endl; + i = end; + } + + blockStart += blockSize; + } + + if(nullTerminate) + target << "0x00," << std::endl; + + if(variable.length() != 0) + { + target << "};" << std::endl; + } +} + +void PrintHelp() +{ + std::cout << "Usage: bin2hex {