diff --git a/SYCL/Matrix/element_wise_all_ops_cuda.cpp b/SYCL/Matrix/element_wise_all_ops_cuda.cpp
index 832194e3d9..2f5322cca1 100644
--- a/SYCL/Matrix/element_wise_all_ops_cuda.cpp
+++ b/SYCL/Matrix/element_wise_all_ops_cuda.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 // REQUIRES: cuda
 
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=3 %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 %s -o %t.out
 // RUN: %t.out
 
 #include <sycl/sycl.hpp>
@@ -64,9 +64,9 @@ void matrix_verify_op(queue q, big_matrix<T, M * nWGperDim, N * nWGperDim> &C,
 
             auto sg = spmd_item.get_sub_group();
 
-            joint_matrix<T, matrix_use::a, M, K, matrix_layout::row_major> sub_a;
-            joint_matrix<T, matrix_use::b, K, N, matrix_layout::row_major> sub_b;
-            joint_matrix<T, matrix_use::accumulator, M, N, matrix_layout::row_major> sub_c;
+            joint_matrix<T, use::a, M, K, layout::row_major> sub_a;
+            joint_matrix<T, use::b, K, N, layout::row_major> sub_b;
+            joint_matrix<T, use::accumulator, M, N> sub_c;
 
             joint_matrix_fill(sg, sub_a, 3);
             joint_matrix_fill(sg, sub_b, 1);
@@ -104,7 +104,7 @@ void matrix_verify_op(queue q, big_matrix<T, M * nWGperDim, N * nWGperDim> &C,
                                accC.get_pointer() +
                                    (sg_startx * M) * (N * nWGperDim) +
                                    sg_starty / SG_SZ * N,
-                               (N * nWGperDim));
+                               (N * nWGperDim), layout::row_major);
           }); // parallel for
     }).wait();
 }
diff --git a/SYCL/Matrix/element_wise_all_ops_cuda_legacy.cpp b/SYCL/Matrix/element_wise_all_ops_cuda_legacy.cpp
new file mode 100644
index 0000000000..c73da53888
--- /dev/null
+++ b/SYCL/Matrix/element_wise_all_ops_cuda_legacy.cpp
@@ -0,0 +1,184 @@
+//==----------- element_wise_all_ops_cuda.cpp - DPC++ joint_matrix---------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: cuda
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=3 %s -o %t.out
+// RUN: %t.out
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi::experimental::matrix;
+using sycl::ext::oneapi::experimental::bfloat16;
+
+#define SG_SZ 32
+constexpr size_t nWGperDim = 2;
+
+class Logical {};
+
+template <typename T, typename Operation, size_t M, size_t K, size_t N>
+class KernelName;
+
+template <typename T, size_t NUM_ROWS, size_t NUM_COLS> struct big_matrix {
+public:
+  T *mat;
+
+public:
+  T *get_data() { return mat; }
+  void set_data(T *data) { mat = data; }
+  big_matrix(T *data) : mat(data) {}
+};
+
+template <typename T, size_t M, size_t N>
+void assert_ops_ref(T *C, const float ref) {
+  for (size_t i = 0; i < M; i++)
+    for (size_t j = 0; j < N; j++) {
+      auto diff = C[i + j * M] - ref;
+      assert(std::fabs(static_cast<float>(diff)) <
+             std::numeric_limits<float>::epsilon());
+    }
+}
+template <typename T, size_t M, size_t K, size_t N, typename Operation>
+void matrix_verify_op(queue q, big_matrix<T, M * nWGperDim, N * nWGperDim> &C,
+                      nd_range<2> &r, const float ref, Operation Op) {
+  {
+    buffer<T, 2> bufC(C.get_data(), range<2>(N * nWGperDim, M * nWGperDim));
+
+    q.submit([&](handler &cgh) {
+      accessor<T, 2, access::mode::read_write, target::device> accC(bufC,
+                                                                    cgh);
+
+      cgh.parallel_for<KernelName<T, Operation, M, K, N>>(
+          r, [accC,
+              Op](nd_item<2> spmd_item) [[sycl::reqd_sub_group_size(SG_SZ)]] {
+            const auto global_idx = spmd_item.get_global_id(0);
+            const auto global_idy = spmd_item.get_global_id(1);
+            const auto sg_startx = global_idx - spmd_item.get_local_id(0);
+            const auto sg_starty = global_idy - spmd_item.get_local_id(1);
+
+            auto sg = spmd_item.get_sub_group();
+
+            joint_matrix<T, matrix_use::a, M, K, matrix_layout::row_major> sub_a;
+            joint_matrix<T, matrix_use::b, K, N, matrix_layout::row_major> sub_b;
+            joint_matrix<T, matrix_use::accumulator, M, N, matrix_layout::row_major> sub_c;
+
+            joint_matrix_fill(sg, sub_a, 3);
+            joint_matrix_fill(sg, sub_b, 1);
+            joint_matrix_fill(sg, sub_c, -80);
+
+            auto wi_slice_a = sub_a.get_wi_data();
+            for (int i = 0; i < wi_slice_a.length(); i++) {
+              if constexpr (std::is_same_v<Operation, Logical>) {
+                if (wi_slice_a[i]) {
+                  if (wi_slice_a[i] > 2.0 || wi_slice_a[i] >= 3.0 ||
+                      wi_slice_a[i] < 4.0 || wi_slice_a[i] <= 3.0) {
+                    T val = (wi_slice_a[i] != (2.0)) ? wi_slice_a[i]
+                                                     : static_cast<T>(2.0);
+                    val = ((val) - (1));
+                    val = ((val) + (1));
+                    if (wi_slice_a[i] == (2.0)) {
+                      val = ((val) - (2));
+                      val = ((val) * (3));
+                      val = ((val) / (2));
+
+                    } else {
+                      val = ((val) + (2));
+                    }
+                    wi_slice_a[i] = val;
+                  }
+                }
+              } else {
+                wi_slice_a[i] = Op(wi_slice_a[i], 2);
+              }
+            }
+
+            sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
+
+            joint_matrix_store(sg, sub_c,
+                               accC.get_pointer() +
+                                   (sg_startx * M) * (N * nWGperDim) +
+                                   sg_starty / SG_SZ * N,
+                               (N * nWGperDim));
+          }); // parallel for
+    }).wait();
+  }
+  assert_ops_ref<T, M * nWGperDim, N * nWGperDim>(C.get_data(), ref);
+}
+
+static constexpr size_t MATRIX_M = 16 * nWGperDim;
+static constexpr size_t MATRIX_N = 16 * nWGperDim;
+
+int main() {
+
+  float D[MATRIX_M][MATRIX_N];
+  big_matrix<float, MATRIX_M, MATRIX_N> MD_f((float *)&D);
+
+  queue q;
+  auto computeCapability =
+      std::stof(q.get_device().get_info<sycl::info::device::backend_version>());
+  nd_range<2> r({nWGperDim, nWGperDim * SG_SZ}, {1, 1 * SG_SZ});
+
+  if (computeCapability >= 7.0) {
+    matrix_verify_op<half, 16, 16, 16>(q, MD_f, r, 0.0, std::plus<half>{});
+    matrix_verify_op<half, 16, 16, 16>(q, MD_f, r, 0.0, Logical{});
+    matrix_verify_op<half, 16, 16, 16>(q, MD_f, r, 16.0,
+                                       std::multiplies<half>{});
+    matrix_verify_op<half, 16, 16, 16>(q, MD_f, r, -56.0,
+                                       std::divides<half>{});
+    matrix_verify_op<half, 16, 16, 16>(q, MD_f, r, -64.0,
+                                       std::minus<half>{});
+  }
+
+  if (computeCapability >= 7.2) {
+    int32_t D_i[MATRIX_M][MATRIX_N];
+    big_matrix<int32_t, MATRIX_M, MATRIX_N> MD_i((int32_t *)&D_i);
+    matrix_verify_op<int8_t, 16, 16, 16>(q, MD_i, r, 0,
+                                         std::plus<int8_t>{});
+    matrix_verify_op<int8_t, 16, 16, 16>(q, MD_i, r, 16,
+                                         std::multiplies<int8_t>{});
+    matrix_verify_op<int8_t, 16, 16, 16>(q, MD_i, r, -64,
+                                         std::minus<int8_t>{});
+    matrix_verify_op<uint8_t, 16, 16, 16>(q, MD_i, r, 0,
+                                          std::plus<uint8_t>{});
+    matrix_verify_op<uint8_t, 16, 16, 16>(q, MD_i, r, 0.0, Logical{});
+    matrix_verify_op<uint8_t, 16, 16, 16>(q, MD_i, r, 16,
+                                          std::multiplies<uint8_t>{});
+    matrix_verify_op<uint8_t, 16, 16, 16>(q, MD_i, r, -64,
+                                          std::minus<uint8_t>{});
+  }
+
+  if (computeCapability >= 8.0) {
+
+    matrix_verify_op<bfloat16, 16, 16, 16>(q, MD_f, r, 0.0,
+                                           std::plus<bfloat16>{});
+    matrix_verify_op<bfloat16, 16, 16, 16>(q, MD_f, r, 0.0, Logical{});
+    matrix_verify_op<bfloat16, 16, 16, 16>(q, MD_f, r, 16.0,
+                                           std::multiplies<bfloat16>{});
+    matrix_verify_op<bfloat16, 16, 16, 16>(q, MD_f, r, -56.0,
+                                           std::divides<bfloat16>{});
+    matrix_verify_op<bfloat16, 16, 16, 16>(q, MD_f, r, -64.0,
+                                           std::minus<bfloat16>{});
+
+    double D_d[MATRIX_M / 2][MATRIX_N / 2];
+    big_matrix<double, MATRIX_M / 2, MATRIX_N / 2> MD_d((double *)&D_d);
+
+    matrix_verify_op<double, 8, 4, 8>(q, MD_d, r, -60.0,
+                                      std::plus<double>{});
+    matrix_verify_op<double, 8, 4, 8>(q, MD_d, r, -60.0, Logical{});
+    matrix_verify_op<double, 8, 4, 8>(q, MD_d, r, -56.0,
+                                      std::multiplies<double>{});
+    matrix_verify_op<double, 8, 4, 8>(q, MD_d, r, -74.0,
+                                      std::divides<double>{});
+    matrix_verify_op<double, 8, 4, 8>(q, MD_d, r, -76.0,
+                                      std::minus<double>{});
+  }
+
+  return 0;
+}
diff --git a/SYCL/Matrix/element_wise_wi_marray.cpp b/SYCL/Matrix/element_wise_wi_marray.cpp
index 8bce3fe880..91ce3cc9b6 100644
--- a/SYCL/Matrix/element_wise_wi_marray.cpp
+++ b/SYCL/Matrix/element_wise_wi_marray.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 // REQUIRES: cuda
 
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=3 %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 %s -o %t.out
 // RUN: %t.out
 
 #include <sycl/sycl.hpp>
@@ -30,8 +30,8 @@ template <typename T> void verify_wi_marray(queue q) {
           [ERR](nd_item<2> spmd_item) [[sycl::reqd_sub_group_size(SG_SZ)]] {
             auto sg = spmd_item.get_sub_group();
 
-            joint_matrix<T, matrix_use::a, 16, 16, matrix_layout::row_major> sub_a;
-            joint_matrix<T, matrix_use::a, 16, 16, matrix_layout::row_major> sub_a_2;
+            joint_matrix<T, use::a, 16, 16, layout::row_major> sub_a;
+            joint_matrix<T, use::a, 16, 16, layout::row_major> sub_a_2;
 
             joint_matrix_fill(sg, sub_a, -1);
             joint_matrix_fill(sg, sub_a_2, -1);
diff --git a/SYCL/Matrix/element_wise_wi_marray_legacy.cpp b/SYCL/Matrix/element_wise_wi_marray_legacy.cpp
new file mode 100644
index 0000000000..8bce3fe880
--- /dev/null
+++ b/SYCL/Matrix/element_wise_wi_marray_legacy.cpp
@@ -0,0 +1,67 @@
+//==----------- element_wise_wi_marray.cpp - DPC++ joint_matrix------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: cuda
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=3 %s -o %t.out
+// RUN: %t.out
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi::experimental;
+using namespace sycl::ext::oneapi::experimental::matrix;
+
+#define SG_SZ 32
+
+template <typename T> void verify_wi_marray(queue q) {
+  int err = 0;
+  {
+    buffer<int> err_buf(&err, 1);
+    q.submit([&](handler &cgh) {
+      accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh);
+
+      cgh.parallel_for(
+          nd_range<2>({1, 1 * SG_SZ}, {1, 1 * SG_SZ}),
+          [ERR](nd_item<2> spmd_item) [[sycl::reqd_sub_group_size(SG_SZ)]] {
+            auto sg = spmd_item.get_sub_group();
+
+            joint_matrix<T, matrix_use::a, 16, 16, matrix_layout::row_major> sub_a;
+            joint_matrix<T, matrix_use::a, 16, 16, matrix_layout::row_major> sub_a_2;
+
+            joint_matrix_fill(sg, sub_a, -1);
+            joint_matrix_fill(sg, sub_a_2, -1);
+
+            auto wi_slice_a = sub_a.get_wi_data();
+            for (int i = 0; i < wi_slice_a.length(); i++) {
+              wi_slice_a[i] = fabs(wi_slice_a[i]);
+            }
+            sub_a_2.wi_marray = fabs(sub_a_2.wi_marray);
+
+            for (int i = 0; i < sub_a_2.wi_marray.size(); i++) {
+              if (sub_a_2.wi_marray[i] != wi_slice_a[i]) {
+                ERR[0] = 1;
+              }
+            }
+          }); // parallel for
+    }).wait();
+  }
+  assert(err == 0);
+}
+
+int main() {
+
+  queue q;
+  auto computeCapability =
+      std::stof(q.get_device().get_info<sycl::info::device::backend_version>());
+
+  if (computeCapability >= 8.0) {
+    verify_wi_marray<bfloat16>(q);
+  }
+
+  return 0;
+}
diff --git a/SYCL/Matrix/joint_matrix_tensorcores.cpp b/SYCL/Matrix/joint_matrix_tensorcores.cpp
new file mode 100644
index 0000000000..d401c3133e
--- /dev/null
+++ b/SYCL/Matrix/joint_matrix_tensorcores.cpp
@@ -0,0 +1,277 @@
+
+// REQUIRES: cuda
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 %s -o %t.out
+// RUN: %t.out
+//
+// This tests the latest unified matrix extension interfaces.
+// Specifying the sm version via the --cuda-gpu-arch flag is necessary
+// for the Nvidia case. DPC++ JIT compilation is not
+// supported for the Nvidia matrix extension, although some JIT optimizations
+// are performed at the level of the PTX assembly code.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi::experimental;
+using namespace sycl::ext::oneapi::experimental::matrix;
+constexpr float bf16_eps = 0.00390625;
+
+// Example usage of Nvidia matrix multiply.
+// Optimizations such as memory paddings for avoiding bank conflicts are not
+// included in this test, which aids clarity for what is going on. This example
+// forms a "Big matrix" corresponding to a single "TILE" using cuda example
+// terminology. Multiple TILES can be used to construct yet larger matrices.
+// This example uses row_major a, b, and accumulator matrices.
+
+// M, N, K define the unit sizes of dimensions of the three types (a, b,
+// accumulator) of matrices per subgroup operation:
+// M: number of rows of "C"/"D" (Accumulator) sub-matrices,
+//    number of rows of "A" sub-matrix.
+// N: number of cols of "C"/"D" (Accumulator) sub-matrices,
+//    number of cols of "B" sub-matrix.
+// K: number of cols of "A"/number of rows of "B" sub-matrices.
+
+// The number of threads per MMA subgroup is always 32 for Nvidia.
+constexpr int N_THREADS_PER_MATRIX_OP = 32;
+
+// number of submatrices per row of accumulator ("C", "D") matrices.
+constexpr int SUB_TILES_M = 2;
+// number of submatrices per col of accumulator matrices.
+constexpr int SUB_TILES_N = 3;
+// number of submatrices per col of "A"/per row of "B" matrices.
+constexpr int SUB_TILES_K = 4;
+
+template <typename T1, typename T2, size_t M, size_t K, size_t N>
+class TypeHelper;
+
+template <typename T1, typename T2, size_t M, size_t K, size_t N>
+using KernelName = class TypeHelper<T1, T2, M, K, N>;
+
+template <size_t Big_N, size_t Big_K, typename T1, typename T2>
+T2 matrix_ref_mn(const int &m, const int &n, T1 *A, T1 *B, T2 *C) {
+  T2 res = C[m * Big_N + n];
+
+  if constexpr (std::is_same<T1, bfloat16>::value) {
+    for (int k = 0; k < Big_K; k++)
+      res += A[m * Big_K + k] * B[k * Big_N + n];
+  } else {
+    for (int k = 0; k < Big_K; k++)
+      res +=
+          static_cast<T2>(A[m * Big_K + k]) * static_cast<T2>(B[k * Big_N + n]);
+  }
+
+  return res;
+}
+
+template <typename T1, typename T2, size_t Sub_Tiles_M, size_t Sub_Tiles_K,
+          size_t Sub_Tiles_N, size_t M, size_t K, size_t N,
+          typename T3 = std::remove_const_t<T1>>
+void test(queue &q) {
+  // total number of M dimension matrix elements for the "Big matrix".
+  constexpr auto Big_M = Sub_Tiles_M * M;
+  // total number of N dimension matrix elements for the "Big matrix".
+  constexpr auto Big_N = Sub_Tiles_N * N;
+  // total number of K dimension matrix elements for the "Big matrix".
+  constexpr auto Big_K = Sub_Tiles_K * K;
+
+  std::remove_const_t<T1> A[Big_M * Big_K];
+  std::remove_const_t<T1> B[Big_K * Big_N];
+  std::remove_const_t<T2> C[Big_M * Big_N];
+  std::remove_const_t<T2> D[Big_M * Big_N];
+
+  for (int i = 0; i < Big_M * Big_N; i++) {
+    C[i] = 1;
+    D[i] = 0;
+  }
+
+  if constexpr (!std::is_same<std::remove_const_t<T1>, bfloat16>::value) {
+    for (int i = 0; i < Big_M * Big_K; i++) {
+      A[i] = i % 100;
+    }
+
+    for (int i = 0; i < Big_K * Big_N; i++) {
+      B[i] = i % 100;
+    }
+  }
+  {
+    if constexpr (std::is_same<std::remove_const_t<T1>, bfloat16>::value) {
+
+      buffer<bfloat16, 1> bufA(A, range<1>(Big_M * Big_K));
+      buffer<bfloat16, 1> bufB(B, range<1>(Big_K * Big_N));
+      q.submit([&](handler &cgh) {
+        accessor<bfloat16, 1, access::mode::write, target::device> accA(bufA,
+                                                                        cgh);
+
+        cgh.parallel_for<KernelName<bfloat16, class copyA, M, K, N>>(
+            range<1>(Big_M * Big_K), [=](item<1> item) {
+              auto i = item.get_linear_id();
+              accA[i] = 0.1f * (i % 10);
+            });
+      });
+      q.submit([&](handler &cgh) {
+        accessor<bfloat16, 1, access::mode::write, target::device> accB(bufB,
+                                                                        cgh);
+
+        cgh.parallel_for<KernelName<bfloat16, class copyB, M, K, N>>(
+            range<1>(Big_K * Big_N), [=](item<1> item) {
+              auto i = item.get_linear_id();
+              accB[i] = 0.1f * (i % 10);
+            });
+      });
+    }
+
+    buffer<T1, 1> bufA(A, range<1>(Big_M * Big_K));
+    buffer<T1, 1> bufB(B, range<1>(Big_K * Big_N));
+    buffer<T2, 1> bufC(C, range<1>(Big_M * Big_N));
+    buffer<std::remove_const_t<T2>, 1> bufD(D, range<1>(Big_M * Big_N));
+
+    q.submit([&](handler &cgh) {
+      accessor<T1, 1, access::mode::read, target::device> accA(bufA, cgh);
+      accessor<T1, 1, access::mode::read, target::device> accB(bufB, cgh);
+      accessor<T2, 1, access::mode::read, target::device> accC(bufC, cgh);
+      accessor<std::remove_const_t<T2>, 1, access::mode::write, target::device>
+          accD(bufD, cgh);
+
+      range<2> LocalRange = {1, N_THREADS_PER_MATRIX_OP};
+      range<2> GlobalRange = {Sub_Tiles_M,
+                              Sub_Tiles_N * N_THREADS_PER_MATRIX_OP};
+
+      cgh.parallel_for<KernelName<T1, T2, M, K, N>>(
+          nd_range<2>(GlobalRange, LocalRange), [=](nd_item<2> item) {
+            sub_group sg = item.get_sub_group();
+            // row id of current submatrix of BIG C matrix
+            const auto m = item.get_group().get_group_id()[0];
+            // column id of current submatrix of BIG C matrix
+            const auto n = item.get_group().get_group_id()[1];
+
+            joint_matrix<T3, use::a, M, K, layout::row_major> sub_a;
+            joint_matrix<T3, use::b, K, N, layout::row_major> sub_b;
+            joint_matrix<std::remove_const_t<T2>, use::accumulator, M, N>
+                sub_c;
+
+            joint_matrix_load(sg, sub_c,
+                              accC.get_pointer() + (m * M) * Big_N + n * N,
+                              Big_N, layout::row_major);
+            // k = row/col id of current submatrix of BIG A/B matrices
+            for (int k = 0; k < Sub_Tiles_K; k++) {
+              joint_matrix_load(sg, sub_a,
+                                accA.get_pointer() + (k * K) + (m * M * Big_K),
+                                Big_K);
+
+              joint_matrix_load(sg, sub_b,
+                                accB.get_pointer() + (k * K * Big_N) + (n * N),
+                                Big_N);
+
+              // round values to correct precision if using tf32
+              if constexpr (std::is_same<T3, precision::tf32>::value) {
+                auto wi_size = sub_a.wi_marray.size();
+                assert(wi_size == sub_b.wi_marray.size());
+                for (auto i = 0; i < wi_size; ++i) {
+                  sub_a.wi_marray[i] = round_to_tf32(sub_a.wi_marray[i]);
+                  sub_b.wi_marray[i] = round_to_tf32(sub_b.wi_marray[i]);
+                }
+              }
+
+              sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
+            }
+            joint_matrix_store(sg, sub_c,
+                               accD.get_pointer() + (m * M) * Big_N + n * N,
+                               Big_N, layout::row_major);
+          });
+    });
+    q.wait();
+  }
+
+  for (int m = 0; m < Big_M; m++) {
+    for (int n = 0; n < Big_N; n++) {
+      if constexpr (std::is_same<std::remove_const_t<T1>, bfloat16>::value) {
+        auto res_device = matrix_ref_mn<Big_N, Big_K>(m, n, A, B, C);
+        assert(fabs(2 * (D[m * Big_N + n] - res_device)) /
+                   (D[m * Big_N + n] + res_device) <
+               bf16_eps * 2);
+      } else {
+        assert(
+            (D[m * Big_N + n] == matrix_ref_mn<Big_N, Big_K>(m, n, A, B, C)));
+      }
+    }
+  }
+};
+
+int main() {
+
+  queue Q;
+  auto computeCapability =
+      std::stof(Q.get_device().get_info<sycl::info::device::backend_version>());
+
+  if (computeCapability >= 7.0) {
+    // A/B half, Accumulator float
+    test<half, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 16, 16, 16>(Q);
+    test<half, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8, 16, 32>(Q);
+    test<half, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 32, 16, 8>(Q);
+
+    test<const half, const float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 16,
+         16, 16>(Q);
+    test<const half, const float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8, 16,
+         32>(Q);
+    test<const half, const float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 32,
+         16, 8>(Q);
+
+    // A/B/Accumulator half
+    test<half, half, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 16, 16, 16>(Q);
+    test<half, half, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8, 16, 32>(Q);
+    test<half, half, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 32, 16, 8>(Q);
+
+    test<const half, const half, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 16, 16,
+         16>(Q);
+    test<const half, const half, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8, 16,
+         32>(Q);
+    test<const half, const half, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 32, 16,
+         8>(Q);
+  }
+  if (computeCapability >= 7.2) {
+    test<int8_t, int32_t, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 16, 16, 16>(Q);
+    test<int8_t, int32_t, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8, 16, 32>(Q);
+    test<int8_t, int32_t, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 32, 16, 8>(Q);
+
+    test<const int8_t, const int32_t, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N,
+         16, 16, 16>(Q);
+    test<const int8_t, const int32_t, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8,
+         16, 32>(Q);
+    test<const int8_t, const int32_t, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N,
+         32, 16, 8>(Q);
+
+    test<uint8_t, int32_t, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 16, 16, 16>(
+        Q);
+    test<uint8_t, int32_t, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8, 16, 32>(Q);
+    test<uint8_t, int32_t, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 32, 16, 8>(Q);
+
+    test<const uint8_t, const int32_t, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N,
+         16, 16, 16>(Q);
+    test<const uint8_t, const int32_t, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N,
+         8, 16, 32>(Q);
+    test<const uint8_t, const int32_t, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N,
+         32, 16, 8>(Q);
+  }
+  if (computeCapability >= 8.0) {
+    test<double, double, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8, 4, 8>(Q);
+    test<const double, const double, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8,
+         4, 8>(Q);
+
+    test<bfloat16, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 16, 16, 16>(Q);
+    test<bfloat16, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8, 16, 32>(Q);
+    test<bfloat16, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 32, 16, 8>(Q);
+
+    test<const bfloat16, const float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N,
+         16, 16, 16>(Q);
+    test<const bfloat16, const float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8,
+         16, 32>(Q);
+    test<const bfloat16, const float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N,
+         32, 16, 8>(Q);
+
+    // A/B tf32
+    test<float, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 16, 8, 16,
+         precision::tf32>(Q);
+    test<const float, const float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 16,
+         8, 16, precision::tf32>(Q);
+  }
+  return 0;
+};
diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcores_legacy.cpp
similarity index 99%
rename from SYCL/Matrix/joint_matrix_tensorcore.cpp
rename to SYCL/Matrix/joint_matrix_tensorcores_legacy.cpp
index 72483bf282..d98a389f1e 100644
--- a/SYCL/Matrix/joint_matrix_tensorcore.cpp
+++ b/SYCL/Matrix/joint_matrix_tensorcores_legacy.cpp
@@ -2,6 +2,7 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=3 %s -o %t.out
 // RUN: %t.out
 //
+// This tests the deprecated legacy matrix extension interfaces.
 // Specifying the sm version via the --cuda-gpu-arch flag is necessary
 // for the Nvidia case. DPC++ JIT compilation is not
 // supported for the Nvidia matrix extension, although some JIT optimizations
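
Taken together, these changes migrate the CUDA tests from the version 3 names (`matrix_use`, `matrix_layout`, and a compile-time accumulator layout) to the version 4 unified interface (`use`, `layout`, and an accumulator whose layout is supplied at load/store time), keeping `_legacy` copies pinned to version 3. The sketch below summarizes the unified usage pattern the new tests exercise. It is a minimal illustration only: the `one_tile` helper, its 16x16x16 half/float shape, and the `global_space` pointer parameters are assumptions for the example, not part of the patch, and it must be compiled like the RUN lines above (with `-DSYCL_EXT_ONEAPI_MATRIX_VERSION=4` and an explicit `--cuda-gpu-arch`).

```cpp
#include <sycl/sycl.hpp>

using namespace sycl;
using namespace sycl::ext::oneapi::experimental::matrix;

// Hypothetical device helper: one 16x16x16 (M x N x K) half/float MMA per
// sub-group. pA/pB/pC point at row-major data; ldA/ldB/ldC are the leading
// dimensions (strides) of the full matrices, as in the tests above.
SYCL_EXTERNAL void
one_tile(sub_group sg,
         multi_ptr<half, access::address_space::global_space> pA,
         multi_ptr<half, access::address_space::global_space> pB,
         multi_ptr<float, access::address_space::global_space> pC,
         size_t ldA, size_t ldB, size_t ldC) {
  // a/b matrices still carry their layout as a template parameter.
  joint_matrix<half, use::a, 16, 16, layout::row_major> sub_a;
  joint_matrix<half, use::b, 16, 16, layout::row_major> sub_b;
  // Version 4: the accumulator no longer has a layout template parameter...
  joint_matrix<float, use::accumulator, 16, 16> sub_c;

  joint_matrix_load(sg, sub_a, pA, ldA);
  joint_matrix_load(sg, sub_b, pB, ldB);
  // ...its layout is passed at run time when loading and storing instead,
  // which is exactly the extra layout::row_major argument the diff adds.
  joint_matrix_load(sg, sub_c, pC, ldC, layout::row_major);
  sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
  joint_matrix_store(sg, sub_c, pC, ldC, layout::row_major);
}
```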