From 59d0e7e51e89126c465eacae48342cf612d0d3e6 Mon Sep 17 00:00:00 2001 From: "arnamoy.bhattacharyya" Date: Wed, 12 Oct 2022 16:06:51 -0400 Subject: [PATCH 01/13] [SYCL][Matrix] Add initial get_coord API. This patch adds initial API for retrieval of coordinates from a work item element. --- sycl/include/CL/__spirv/spirv_ops.hpp | 8 + .../sycl/ext/oneapi/matrix/matrix-jit-use.hpp | 13 ++ .../matrix/matrix-bfloat16-test-coord.cpp | 205 ++++++++++++++++++ 3 files changed, 226 insertions(+) create mode 100644 sycl/test/matrix/matrix-bfloat16-test-coord.cpp diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index 21181b4080399..d2593fcfb9580 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -123,6 +123,14 @@ template +extern SYCL_EXTERNAL std::tuple +__spirv_JointMatrixWorkItemElemCoord(JOINT_MATRIX_INTEL(T, R, C, L, S, U) *, + size_t i); + template #include #include +#include namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { @@ -256,6 +257,18 @@ class wi_element { wi_element(joint_matrix &Mat, std::size_t i) : M(Mat), idx(i) {} + + // Functions + std::tuple get_coord() { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_JointMatrixWorkItemElemCoord(M.spvm, idx); +#else + throw runtime_error("joint matrix is not supported on host device.", + PI_ERROR_INVALID_DEVICE); +#endif // __SYCL_DEVICE_ONLY__ + } + + // Various Operations operator T() { #ifdef __SYCL_DEVICE_ONLY__ return __spirv_VectorExtractDynamic(M.spvm, idx); diff --git a/sycl/test/matrix/matrix-bfloat16-test-coord.cpp b/sycl/test/matrix/matrix-bfloat16-test-coord.cpp new file mode 100644 index 0000000000000..c0d746a81922e --- /dev/null +++ b/sycl/test/matrix/matrix-bfloat16-test-coord.cpp @@ -0,0 +1,205 @@ +// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -DSYCL_EXT_ONEAPI_MATRIX_VERSION=2 -S -emit-llvm %s -o %t.out +#include +#include + +using namespace sycl::ext::oneapi::experimental::matrix; +using bfloat16 = sycl::ext::oneapi::experimental::bfloat16; + +static constexpr auto TILE_SZ = 16; +static constexpr auto TM = TILE_SZ - 1; +static constexpr auto TN = TILE_SZ - 1; +static constexpr auto TK = 2 * TILE_SZ - 2; + +static constexpr auto SG_SZ = 16; + +template struct big_matrix { +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) {} +}; + +static constexpr size_t MATRIX_M = TM * 2; +static constexpr size_t MATRIX_N = TN * 2; +static constexpr size_t MATRIX_K = TK * 2; +bfloat16 A[MATRIX_M][MATRIX_K]; +bfloat16 B[MATRIX_K / 2][MATRIX_N * 2]; +unsigned short Aref[MATRIX_M][MATRIX_K]; +unsigned short Bref[MATRIX_K / 2][MATRIX_N * 2]; +float C[MATRIX_M][MATRIX_N]; +float D[MATRIX_M][MATRIX_N]; +int32_t *res_local_row; + +template +void matrix_multiply(big_matrix &C, + big_matrix &A, + big_matrix &B) { + size_t M = NUM_ROWS_C; + size_t N = NUM_COLS_C; + size_t K = NUM_COLS_A; + // B => K/4 x N*4, A => M x K, C => M, N + // stride should be X's cols, e.g., B's stirde = N*4 + assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * 2); + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + sycl::buffer bufA(A.get_data(), sycl::range<2>(M, K)); + sycl::buffer bufB(B.get_data(), sycl::range<2>(K, N)); + sycl::buffer bufC((float *)C.get_data(), sycl::range<2>(M, N)); + + sycl::buffer res_local_row_buf(res_local_row, + sycl::range<1>(MATRIX_M)); + + sycl::queue q; + q.submit([&](sycl::handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = 
bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + auto res_local_row_acc = + res_local_row_buf.get_access(cgh); + + cgh.parallel_for( + sycl::nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), + [accA, accB, accC, M, N, K, + res_local_row_acc](sycl::nd_item<2> spmd_item) + + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sycl::ext::oneapi::sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a(sg); + // For B, since current implementation does not support non-packed + // layout, users need to specify the updated VNNI sizes along with + // the packed_b layout. By default, the layout is row_major and size + // is (TK, TN). + joint_matrix sub_b(sg); + joint_matrix sub_c(sg); + + joint_matrix_load(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, layout::row_major); + for (int k = 0; k < K / TK; k += 1) { // + joint_matrix_load( + sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, + K, layout::row_major); + // Assuming B data is already in VNNI format. + joint_matrix_load(sg, sub_b, + accB.get_pointer() + (k * TK / 2) * (N * 2) + + sg_starty / SG_SZ * TN * 2, + N * 2, layout::packed_b); + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + } + joint_matrix_store(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, layout::row_major); + // Element wise operation + auto tCData = sub_c.get_wi_data(); + + for (int i = 0; i < tCData.length(); ++i) { + size_t row, col; + std::tie(row, col) = tCData[i].get_coord(); + res_local_row_acc[row] += tCData[i]; + } + }); // parallel for + }).wait(); +} + +float make_fp32(short x) { + unsigned int y = x; + y = y << 16; + float *res = reinterpret_cast(&y); + return *res; +} + +unsigned short make_bf16(float x) { + int *res = reinterpret_cast(&x); + *res = *res >> 16; + return (unsigned short)*res; +} + +void matrix_multiply_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, + int K) { + // tiling + for (int m = 0; m < M; m++) + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k++) { + short *va = (short *)(A_mem + m * K + k); + short *vb = (short *)(B_mem + k * N + n); + float acc = *((float *)(C_mem + m * N + n)); + // FIXME: Should we do reduce-add in another version? + for (int i = 0; i < 2; i++) { + acc += (make_fp32(va[i]) * make_fp32(vb[i])); + } + *((float *)(C_mem + m * N + n)) = acc; + } + } +} + +int main() { + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_K; j++) { + // Ee create bfloat16 from unsigned short since float-to-bfloat's + // conversion is not allowed. 
+ A[i][j] = bfloat16::from_bits(make_bf16(1.0f * (i + j))); + Aref[i][j] = make_bf16(1.0f * (i + j)); + } + } + for (int i = 0; i < MATRIX_K / 2; i++) { + for (int j = 0; j < MATRIX_N * 2; j++) { + B[i][j] = bfloat16::from_bits((make_bf16(2.0f * i + 3.0f * j))); + Bref[i][j] = make_bf16(2.0f * i + 3.0f * j); + } + } + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + C[i][j] = 1.0; + D[i][j] = 1.0; + } + } + + big_matrix MC((float *)&C); + big_matrix MD((float *)&D); + big_matrix MA((bfloat16 *)&A); + big_matrix MB((bfloat16 *)&B); + + res_local_row = (int32_t *)calloc(MATRIX_M, sizeof(int32_t)); + + matrix_multiply(MC, MA, MB); + matrix_multiply_ref((int32_t *)Aref, (int32_t *)Bref, (int32_t *)D, MATRIX_M, + MATRIX_N, MATRIX_K / 2); + + bool res = true; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + if (C[i][j] != D[i][j]) + res = false; + } + } + if (res) + std::cout << "passed\n"; + else + std::cout << "failed\n"; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) + std::cout << C[i][j] << ", "; + std::cout << "\n"; + } + std::cout << std::endl; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) + std::cout << D[i][j] << ", "; + std::cout << "\n"; + } +} From 135d82b827b763f439c61bcaba9e815264e543a8 Mon Sep 17 00:00:00 2001 From: "arnamoy.bhattacharyya" Date: Fri, 14 Oct 2022 10:39:45 -0400 Subject: [PATCH 02/13] Reviewers comments --- .../sycl/ext/oneapi/matrix/matrix-jit-use.hpp | 21 ++++++++++++++++++- .../matrix/matrix-bfloat16-test-coord.cpp | 14 +++++++++---- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp index 9c1623c64e429..b8b20d9296afe 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp @@ -258,7 +258,6 @@ class wi_element { std::size_t i) : M(Mat), idx(i) {} - // Functions std::tuple get_coord() { #ifdef __SYCL_DEVICE_ONLY__ return __spirv_JointMatrixWorkItemElemCoord(M.spvm, idx); @@ -352,6 +351,16 @@ class wi_element { wi_element(joint_matrix &Mat, std::size_t i) : M(Mat), idx(i) {} + + std::tuple get_coord() { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_JointMatrixWorkItemElemCoord(M.spvm, idx); +#else + throw runtime_error("joint matrix is not supported on host device.", + PI_ERROR_INVALID_DEVICE); +#endif // __SYCL_DEVICE_ONLY__ + } + operator uint16_t() { #ifdef __SYCL_DEVICE_ONLY__ return __spirv_VectorExtractDynamic(M.spvm, idx); @@ -502,6 +511,16 @@ class wi_element &Mat, std::size_t i) : M(Mat), idx(i) {} + + std::tuple get_coord() { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_JointMatrixWorkItemElemCoord(M.spvm, idx); +#else + throw runtime_error("joint matrix is not supported on host device.", + PI_ERROR_INVALID_DEVICE); +#endif // __SYCL_DEVICE_ONLY__ + } + operator sycl::ext::oneapi::experimental::bfloat16() { #ifdef __SYCL_DEVICE_ONLY__ return __spirv_VectorExtractDynamic(M.spvm, idx); diff --git a/sycl/test/matrix/matrix-bfloat16-test-coord.cpp b/sycl/test/matrix/matrix-bfloat16-test-coord.cpp index c0d746a81922e..ad5b14f1d1c50 100644 --- a/sycl/test/matrix/matrix-bfloat16-test-coord.cpp +++ b/sycl/test/matrix/matrix-bfloat16-test-coord.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -DSYCL_EXT_ONEAPI_MATRIX_VERSION=2 -S -emit-llvm %s -o %t.out +// RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=2 %s -o %t.out 
#include #include @@ -32,6 +32,7 @@ unsigned short Bref[MATRIX_K / 2][MATRIX_N * 2]; float C[MATRIX_M][MATRIX_N]; float D[MATRIX_M][MATRIX_N]; int32_t *res_local_row; +int32_t *res_local_row_orig; template &C, // Element wise operation auto tCData = sub_c.get_wi_data(); - for (int i = 0; i < tCData.length(); ++i) { - size_t row, col; - std::tie(row, col) = tCData[i].get_coord(); + for (int i = 0; i < tCData.length(); ++i) { + auto [row, col] = tCData[i].get_coord(); res_local_row_acc[row] += tCData[i]; } }); // parallel for @@ -143,6 +143,7 @@ void matrix_multiply_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, acc += (make_fp32(va[i]) * make_fp32(vb[i])); } *((float *)(C_mem + m * N + n)) = acc; + res_local_row_orig[m] += acc; } } } @@ -175,6 +176,7 @@ int main() { big_matrix MB((bfloat16 *)&B); res_local_row = (int32_t *)calloc(MATRIX_M, sizeof(int32_t)); + res_local_row_orig = (int32_t *)calloc(MATRIX_M, sizeof(int32_t)); matrix_multiply(MC, MA, MB); matrix_multiply_ref((int32_t *)Aref, (int32_t *)Bref, (int32_t *)D, MATRIX_M, @@ -187,6 +189,10 @@ int main() { res = false; } } + for (int i = 0; i < MATRIX_M; i++) { + if (res_local_row[i] != res_local_row_orig[i]) + res = false; + } if (res) std::cout << "passed\n"; else From 95adb669c497e4bec5db975f30205b47dc47718c Mon Sep 17 00:00:00 2001 From: "arnamoy.bhattacharyya" Date: Fri, 14 Oct 2022 10:44:20 -0400 Subject: [PATCH 03/13] clang-format --- sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp | 2 +- sycl/test/matrix/matrix-bfloat16-test-coord.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp index b8b20d9296afe..1d4251b40c16e 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp @@ -360,7 +360,7 @@ class wi_element { PI_ERROR_INVALID_DEVICE); #endif // __SYCL_DEVICE_ONLY__ } - + operator uint16_t() { #ifdef __SYCL_DEVICE_ONLY__ return __spirv_VectorExtractDynamic(M.spvm, idx); diff --git a/sycl/test/matrix/matrix-bfloat16-test-coord.cpp b/sycl/test/matrix/matrix-bfloat16-test-coord.cpp index ad5b14f1d1c50..4da7bafb29dc3 100644 --- a/sycl/test/matrix/matrix-bfloat16-test-coord.cpp +++ b/sycl/test/matrix/matrix-bfloat16-test-coord.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=2 %s -o %t.out +// RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=2 %s -o %t.out #include #include @@ -108,7 +108,7 @@ void matrix_multiply(big_matrix &C, // Element wise operation auto tCData = sub_c.get_wi_data(); - for (int i = 0; i < tCData.length(); ++i) { + for (int i = 0; i < tCData.length(); ++i) { auto [row, col] = tCData[i].get_coord(); res_local_row_acc[row] += tCData[i]; } @@ -190,8 +190,8 @@ int main() { } } for (int i = 0; i < MATRIX_M; i++) { - if (res_local_row[i] != res_local_row_orig[i]) - res = false; + if (res_local_row[i] != res_local_row_orig[i]) + res = false; } if (res) std::cout << "passed\n"; From c7e6000931319cfb222c4bb9b7dac4fa16723eef Mon Sep 17 00:00:00 2001 From: "arnamoy.bhattacharyya" Date: Mon, 17 Oct 2022 16:59:38 -0400 Subject: [PATCH 04/13] Using olc_vec type in the spirv operation and creating a tuple using the vec to get the coordinates. 
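
For context, the user-facing pattern this change enables inside a kernel looks roughly like the fragment below (a sketch only: names such as sub_c and row_sums are placeholders mirroring the test added in this series, not new API):

    auto data = sub_c.get_wi_data();
    for (int i = 0; i < data.length(); ++i) {
      // get_coord() reports the (row, col) position of this work-item's
      // element within the joint matrix; here it drives a per-row reduction.
      auto [row, col] = data[i].get_coord();
      row_sums[row] += data[i];
    }
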
--- sycl/include/CL/__spirv/spirv_ops.hpp | 2 +- .../sycl/ext/oneapi/matrix/matrix-jit-use.hpp | 15 +- .../matrix-bfloat16-test-coord-basic.cpp | 220 ++++++++++++++++++ ...pp => matrix-bfloat16-test-coord-gemm.cpp} | 0 4 files changed, 233 insertions(+), 4 deletions(-) create mode 100644 sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp rename sycl/test/matrix/{matrix-bfloat16-test-coord.cpp => matrix-bfloat16-test-coord-gemm.cpp} (100%) diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index d2593fcfb9580..a7fe392fc0b06 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -127,7 +127,7 @@ template -extern SYCL_EXTERNAL std::tuple +extern SYCL_EXTERNAL __ocl_vec_t __spirv_JointMatrixWorkItemElemCoord(JOINT_MATRIX_INTEL(T, R, C, L, S, U) *, size_t i); diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp index 1d4251b40c16e..cab61d3d89a6d 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp @@ -260,7 +260,10 @@ class wi_element { std::tuple get_coord() { #ifdef __SYCL_DEVICE_ONLY__ - return __spirv_JointMatrixWorkItemElemCoord(M.spvm, idx); + __ocl_vec_t co_ord = __spirv_JointMatrixWorkItemElemCoord(M.spvm, idx); + const int32_t row = co_ord[0]; + const int32_t col = co_ord[1]; + return std::make_tuple(row, col); #else throw runtime_error("joint matrix is not supported on host device.", PI_ERROR_INVALID_DEVICE); @@ -354,7 +357,10 @@ class wi_element { std::tuple get_coord() { #ifdef __SYCL_DEVICE_ONLY__ - return __spirv_JointMatrixWorkItemElemCoord(M.spvm, idx); + __ocl_vec_t co_ord = __spirv_JointMatrixWorkItemElemCoord(M.spvm, idx); + const int32_t row = co_ord[0]; + const int32_t col = co_ord[1]; + return std::make_tuple(row, col); #else throw runtime_error("joint matrix is not supported on host device.", PI_ERROR_INVALID_DEVICE); @@ -514,7 +520,10 @@ class wi_element get_coord() { #ifdef __SYCL_DEVICE_ONLY__ - return __spirv_JointMatrixWorkItemElemCoord(M.spvm, idx); + __ocl_vec_t co_ord = __spirv_JointMatrixWorkItemElemCoord(M.spvm, idx); + const int32_t row = co_ord[0]; + const int32_t col = co_ord[1]; + return std::make_tuple(row, col); #else throw runtime_error("joint matrix is not supported on host device.", PI_ERROR_INVALID_DEVICE); diff --git a/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp b/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp new file mode 100644 index 0000000000000..ff2a96eac5d6d --- /dev/null +++ b/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp @@ -0,0 +1,220 @@ +// RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=2 %s -o %t.out +#include +#include + +using namespace sycl::ext::oneapi::experimental::matrix; +using bfloat16 = sycl::ext::oneapi::experimental::bfloat16; + +static constexpr auto TILE_SZ = 16; +static constexpr auto TM = TILE_SZ - 1; +static constexpr auto TN = TILE_SZ - 1; +static constexpr auto TK = 2 * TILE_SZ - 2; + +static constexpr auto SG_SZ = 16; + +template struct big_matrix { +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) {} +}; + +static constexpr size_t MATRIX_M = TM * 2; +static constexpr size_t MATRIX_N = TN * 2; +static constexpr size_t MATRIX_K = TK * 2; +bfloat16 A[MATRIX_M][MATRIX_K]; +bfloat16 B[MATRIX_K / 2][MATRIX_N * 2]; +unsigned short Aref[MATRIX_M][MATRIX_K]; +unsigned short 
Bref[MATRIX_K / 2][MATRIX_N * 2]; +float C[MATRIX_M][MATRIX_N]; +float D[MATRIX_M][MATRIX_N]; +int32_t *res_local_rowA; +int32_t *res_local_colB; +int32_t *res_local_rowC; +int32_t *res_local_row_origA; +int32_t *res_local_col_origB; +int32_t *res_local_row_origC; +template +void matrix_coord(big_matrix &C, + big_matrix &A, + big_matrix &B) { + size_t M = NUM_ROWS_C; + size_t N = NUM_COLS_C; + size_t K = NUM_COLS_A; + // B => K/4 x N*4, A => M x K, C => M, N + // stride should be X's cols, e.g., B's stirde = N*4 + assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * 2); + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + sycl::buffer bufA(A.get_data(), sycl::range<2>(M, K)); + sycl::buffer bufB(B.get_data(), sycl::range<2>(K, N)); + sycl::buffer bufC((float *)C.get_data(), sycl::range<2>(M, N)); + + sycl::buffer res_local_row_bufA(res_local_rowA, + sycl::range<1>(MATRIX_M)); + sycl::buffer res_local_col_bufB(res_local_colB, + sycl::range<1>(MATRIX_N)); + sycl::buffer res_local_row_bufC(res_local_rowC, + sycl::range<1>(MATRIX_M)); + + sycl::queue q; + q.submit([&](sycl::handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + + auto res_local_row_accA = + res_local_row_bufA.get_access(cgh); + auto res_local_col_accB = + res_local_col_bufB.get_access(cgh); + auto res_local_row_accC = + res_local_row_bufC.get_access(cgh); + + cgh.parallel_for( + sycl::nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), + [accA, accB, accC, M, N, K, + res_local_row_accA, res_local_col_accB, res_local_row_accC](sycl::nd_item<2> spmd_item) + + { + sycl::ext::oneapi::sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a(sg); + // For B, since current implementation does not support non-packed + // layout, users need to specify the updated VNNI sizes along with + // the packed_b layout. By default, the layout is row_major and size + // is (TK, TN). 
+ joint_matrix sub_b(sg); + joint_matrix sub_c(sg); + + joint_matrix_fill(sg, sub_a, 1); + joint_matrix_fill(sg, sub_b, 2); + joint_matrix_fill(sg, sub_c, 3); + // Element wise operation + auto tAData = sub_a.get_wi_data(); + auto tBData = sub_b.get_wi_data(); + auto tCData = sub_c.get_wi_data(); + + for (int i = 0; i < tAData.length(); ++i) { + auto [row, col] = tAData[i].get_coord(); + res_local_row_accA[row] += tAData[i]; + } + + for (int i = 0; i < tBData.length(); ++i) { + auto [row, col] = tBData[i].get_coord(); + res_local_col_accB[col] += tBData[i]; + } + + for (int i = 0; i < tCData.length(); ++i) { + auto [row, col] = tCData[i].get_coord(); + res_local_row_accC[row] += tCData[i]; + } + }); // parallel for + }).wait(); +} + +float make_fp32(short x) { + unsigned int y = x; + y = y << 16; + float *res = reinterpret_cast(&y); + return *res; +} + +unsigned short make_bf16(float x) { + int *res = reinterpret_cast(&x); + *res = *res >> 16; + return (unsigned short)*res; +} + +void matrix_coord_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, + int K) { + for (int m = 0; m < M; m++) + for (int k = 0; k < K; k++) { + short *va = (short *)(A_mem + m * K + k); + res_local_row_origA[m] += *va; + } + + for (int k = 0; k < K; k++) + for (int n = 0; n < N; n++) { + short *vb = (short *)(B_mem + k * N + n); + res_local_col_origB[n] += *vb; + } + + for (int m = 0; m < M; m++) + for (int n = 0; n < N; n++) { + short *vc = (short *)(C_mem + m * N + n); + res_local_row_origC[m] += *vc; + } +} + +int main() { + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_K; j++) { + // Ee create bfloat16 from unsigned short since float-to-bfloat's + // conversion is not allowed. + A[i][j] = bfloat16::from_bits(make_bf16(1.0f)); + Aref[i][j] = make_bf16(1.0f); + } + } + for (int i = 0; i < MATRIX_K / 2; i++) { + for (int j = 0; j < MATRIX_N * 2; j++) { + B[i][j] = bfloat16::from_bits((make_bf16(2.0f))); + Bref[i][j] = make_bf16(2.0f); + } + } + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + C[i][j] = 3.0; + D[i][j] = 3.0; + } + } + + big_matrix MC((float *)&C); + big_matrix MD((float *)&D); + big_matrix MA((bfloat16 *)&A); + big_matrix MB((bfloat16 *)&B); + + res_local_rowA = (int32_t *)calloc(MATRIX_M, sizeof(int32_t)); + res_local_colB = (int32_t *)calloc(MATRIX_N, sizeof(int32_t)); + res_local_rowC = (int32_t *)calloc(MATRIX_M, sizeof(int32_t)); + + res_local_row_origA = (int32_t *)calloc(MATRIX_M, sizeof(int32_t)); + res_local_col_origB = (int32_t *)calloc(MATRIX_N, sizeof(int32_t)); + res_local_row_origC = (int32_t *)calloc(MATRIX_M, sizeof(int32_t)); + + matrix_coord(MC, MA, MB); + matrix_coord_ref((int32_t *)Aref, (int32_t *)Bref, (int32_t *)D, MATRIX_M, + MATRIX_N, MATRIX_K / 2); + + bool res = true; + for (int i = 0; i < MATRIX_M; i++) { + if (res_local_rowA[i] != res_local_row_origA[i]) + res = false; + } + for (int i = 0; i < MATRIX_K; i++) { + if (res_local_colB[i] != res_local_col_origB[i]) + res = false; + } + for (int i = 0; i < MATRIX_M; i++) { + if (res_local_rowC[i] != res_local_row_origC[i]) + res = false; + } + if (res) + std::cout << "passed\n"; + else + std::cout << "failed\n"; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) + std::cout << C[i][j] << ", "; + std::cout << "\n"; + } + std::cout << std::endl; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) + std::cout << D[i][j] << ", "; + std::cout << "\n"; + } +} diff --git a/sycl/test/matrix/matrix-bfloat16-test-coord.cpp 
b/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp similarity index 100% rename from sycl/test/matrix/matrix-bfloat16-test-coord.cpp rename to sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp From 7742fd90f4d3c8a71a7b55671dc0ebff073a8efc Mon Sep 17 00:00:00 2001 From: "arnamoy.bhattacharyya" Date: Mon, 17 Oct 2022 17:00:55 -0400 Subject: [PATCH 05/13] clang-format --- .../sycl/ext/oneapi/matrix/matrix-jit-use.hpp | 9 ++- .../matrix-bfloat16-test-coord-basic.cpp | 55 +++++++++---------- 2 files changed, 33 insertions(+), 31 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp index cab61d3d89a6d..a1e8392c6da9d 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp @@ -260,7 +260,8 @@ class wi_element { std::tuple get_coord() { #ifdef __SYCL_DEVICE_ONLY__ - __ocl_vec_t co_ord = __spirv_JointMatrixWorkItemElemCoord(M.spvm, idx); + __ocl_vec_t co_ord = + __spirv_JointMatrixWorkItemElemCoord(M.spvm, idx); const int32_t row = co_ord[0]; const int32_t col = co_ord[1]; return std::make_tuple(row, col); @@ -357,7 +358,8 @@ class wi_element { std::tuple get_coord() { #ifdef __SYCL_DEVICE_ONLY__ - __ocl_vec_t co_ord = __spirv_JointMatrixWorkItemElemCoord(M.spvm, idx); + __ocl_vec_t co_ord = + __spirv_JointMatrixWorkItemElemCoord(M.spvm, idx); const int32_t row = co_ord[0]; const int32_t col = co_ord[1]; return std::make_tuple(row, col); @@ -520,7 +522,8 @@ class wi_element get_coord() { #ifdef __SYCL_DEVICE_ONLY__ - __ocl_vec_t co_ord = __spirv_JointMatrixWorkItemElemCoord(M.spvm, idx); + __ocl_vec_t co_ord = + __spirv_JointMatrixWorkItemElemCoord(M.spvm, idx); const int32_t row = co_ord[0]; const int32_t col = co_ord[1]; return std::make_tuple(row, col); diff --git a/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp b/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp index ff2a96eac5d6d..78e08cc29ac65 100644 --- a/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp +++ b/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp @@ -41,8 +41,8 @@ template void matrix_coord(big_matrix &C, - big_matrix &A, - big_matrix &B) { + big_matrix &A, + big_matrix &B) { size_t M = NUM_ROWS_C; size_t N = NUM_COLS_C; size_t K = NUM_COLS_A; @@ -56,11 +56,11 @@ void matrix_coord(big_matrix &C, sycl::buffer bufC((float *)C.get_data(), sycl::range<2>(M, N)); sycl::buffer res_local_row_bufA(res_local_rowA, - sycl::range<1>(MATRIX_M)); + sycl::range<1>(MATRIX_M)); sycl::buffer res_local_col_bufB(res_local_colB, - sycl::range<1>(MATRIX_N)); + sycl::range<1>(MATRIX_N)); sycl::buffer res_local_row_bufC(res_local_rowC, - sycl::range<1>(MATRIX_M)); + sycl::range<1>(MATRIX_M)); sycl::queue q; q.submit([&](sycl::handler &cgh) { @@ -77,8 +77,8 @@ void matrix_coord(big_matrix &C, cgh.parallel_for( sycl::nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [accA, accB, accC, M, N, K, - res_local_row_accA, res_local_col_accB, res_local_row_accC](sycl::nd_item<2> spmd_item) + [accA, accB, accC, M, N, K, res_local_row_accA, res_local_col_accB, + res_local_row_accC](sycl::nd_item<2> spmd_item) { sycl::ext::oneapi::sub_group sg = spmd_item.get_sub_group(); @@ -92,7 +92,7 @@ void matrix_coord(big_matrix &C, joint_matrix_fill(sg, sub_a, 1); joint_matrix_fill(sg, sub_b, 2); - joint_matrix_fill(sg, sub_c, 3); + joint_matrix_fill(sg, sub_c, 3); // Element wise operation auto tAData = sub_a.get_wi_data(); auto tBData = sub_b.get_wi_data(); @@ -129,25 +129,24 @@ 
unsigned short make_bf16(float x) { return (unsigned short)*res; } -void matrix_coord_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, - int K) { - for (int m = 0; m < M; m++) - for (int k = 0; k < K; k++) { - short *va = (short *)(A_mem + m * K + k); - res_local_row_origA[m] += *va; - } - - for (int k = 0; k < K; k++) - for (int n = 0; n < N; n++) { - short *vb = (short *)(B_mem + k * N + n); - res_local_col_origB[n] += *vb; - } - - for (int m = 0; m < M; m++) - for (int n = 0; n < N; n++) { - short *vc = (short *)(C_mem + m * N + n); - res_local_row_origC[m] += *vc; - } +void matrix_coord_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, int K) { + for (int m = 0; m < M; m++) + for (int k = 0; k < K; k++) { + short *va = (short *)(A_mem + m * K + k); + res_local_row_origA[m] += *va; + } + + for (int k = 0; k < K; k++) + for (int n = 0; n < N; n++) { + short *vb = (short *)(B_mem + k * N + n); + res_local_col_origB[n] += *vb; + } + + for (int m = 0; m < M; m++) + for (int n = 0; n < N; n++) { + short *vc = (short *)(C_mem + m * N + n); + res_local_row_origC[m] += *vc; + } } int main() { @@ -187,7 +186,7 @@ int main() { matrix_coord(MC, MA, MB); matrix_coord_ref((int32_t *)Aref, (int32_t *)Bref, (int32_t *)D, MATRIX_M, - MATRIX_N, MATRIX_K / 2); + MATRIX_N, MATRIX_K / 2); bool res = true; for (int i = 0; i < MATRIX_M; i++) { From b4e3ef53a04e6617e9e08cd7ca5d4ca7af5768d8 Mon Sep 17 00:00:00 2001 From: "arnamoy.bhattacharyya" Date: Tue, 18 Oct 2022 10:52:05 -0400 Subject: [PATCH 06/13] Review comments --- sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp index a1e8392c6da9d..936bb5a6d5190 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp @@ -358,10 +358,10 @@ class wi_element { std::tuple get_coord() { #ifdef __SYCL_DEVICE_ONLY__ - __ocl_vec_t co_ord = + __ocl_vec_t coord = __spirv_JointMatrixWorkItemElemCoord(M.spvm, idx); - const int32_t row = co_ord[0]; - const int32_t col = co_ord[1]; + const int32_t row = coord[0]; + const int32_t col = coord[1]; return std::make_tuple(row, col); #else throw runtime_error("joint matrix is not supported on host device.", From 57a97cfec570906723c90b563f6a986e00c5a575 Mon Sep 17 00:00:00 2001 From: "arnamoy.bhattacharyya" Date: Mon, 24 Oct 2022 12:48:31 -0400 Subject: [PATCH 07/13] Makeaccess through USM, also update the basic kernel with use of bfloat16, fix theCPU kernel --- .../matrix-bfloat16-test-coord-basic.cpp | 461 +++++++++++++----- .../matrix-bfloat16-test-coord-gemm.cpp | 6 +- 2 files changed, 330 insertions(+), 137 deletions(-) diff --git a/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp b/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp index 78e08cc29ac65..c396cb7a92833 100644 --- a/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp +++ b/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp @@ -1,16 +1,234 @@ // RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=2 %s -o %t.out +// #include +// #include + +// using namespace sycl::ext::oneapi::experimental::matrix; +// using bfloat16 = sycl::ext::oneapi::experimental::bfloat16; + +// static constexpr auto TILE_SZ = 16; +// static constexpr auto TM = TILE_SZ - 1; +// static constexpr auto TN = TILE_SZ - 1; +// static constexpr auto TK = 2 * TILE_SZ - 2; + +// static constexpr auto SG_SZ = 16; + +// 
template struct big_matrix { +// public: +// T *mat; + +// public: +// T *get_data() { return mat; } +// void set_data(T *data) { mat = data; } +// big_matrix(T *data) : mat(data) {} +// }; + +// static constexpr size_t MATRIX_M = TM * 2; +// static constexpr size_t MATRIX_N = TN * 2; +// static constexpr size_t MATRIX_K = TK * 2; +// bfloat16 A[MATRIX_M][MATRIX_K]; +// bfloat16 B[MATRIX_K / 2][MATRIX_N * 2]; +// unsigned short Aref[MATRIX_M][MATRIX_K]; +// unsigned short Bref[MATRIX_K / 2][MATRIX_N * 2]; +// float C[MATRIX_M][MATRIX_N]; +// float D[MATRIX_M][MATRIX_N]; +// int32_t *res_local_rowA; +// int32_t *res_local_colB; +// int32_t *res_local_rowC; +// int32_t *res_local_row_origA; +// int32_t *res_local_col_origB; +// int32_t *res_local_row_origC; +// template +// void matrix_coord(big_matrix &C, +// big_matrix &A, +// big_matrix &B) { +// size_t M = NUM_ROWS_C; +// size_t N = NUM_COLS_C; +// size_t K = NUM_COLS_A; +// // B => K/4 x N*4, A => M x K, C => M, N +// // stride should be X's cols, e.g., B's stirde = N*4 +// assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * 2); +// size_t NDRangeM = M / TM; +// size_t NDRangeN = N / TN; +// sycl::buffer bufA(A.get_data(), sycl::range<2>(M, K)); +// sycl::buffer bufB(B.get_data(), sycl::range<2>(K, N)); +// sycl::buffer bufC((float *)C.get_data(), sycl::range<2>(M, N)); + +// sycl::buffer res_local_row_bufA(res_local_rowA, +// sycl::range<1>(MATRIX_M)); +// sycl::buffer res_local_col_bufB(res_local_colB, +// sycl::range<1>(MATRIX_N)); +// sycl::buffer res_local_row_bufC(res_local_rowC, +// sycl::range<1>(MATRIX_M)); + +// sycl::queue q; +// q.submit([&](sycl::handler &cgh) { +// auto accC = bufC.get_access(cgh); +// auto accA = bufA.get_access(cgh); +// auto accB = bufB.get_access(cgh); + +// auto res_local_row_accA = +// res_local_row_bufA.get_access(cgh); +// auto res_local_col_accB = +// res_local_col_bufB.get_access(cgh); +// auto res_local_row_accC = +// res_local_row_bufC.get_access(cgh); + +// cgh.parallel_for( +// sycl::nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), +// [accA, accB, accC, M, N, K, res_local_row_accA, res_local_col_accB, +// res_local_row_accC](sycl::nd_item<2> spmd_item) + +// { +// sycl::ext::oneapi::sub_group sg = spmd_item.get_sub_group(); +// joint_matrix sub_a(sg); +// // For B, since current implementation does not support non-packed +// // layout, users need to specify the updated VNNI sizes along with +// // the packed_b layout. By default, the layout is row_major and size +// // is (TK, TN). 
+// joint_matrix sub_b(sg); +// joint_matrix sub_c(sg); + +// joint_matrix_fill(sg, sub_a, 1); +// joint_matrix_fill(sg, sub_b, 2); +// joint_matrix_fill(sg, sub_c, 3); +// // Element wise operation +// auto tAData = sub_a.get_wi_data(); +// auto tBData = sub_b.get_wi_data(); +// auto tCData = sub_c.get_wi_data(); + +// for (int i = 0; i < tAData.length(); ++i) { +// auto [row, col] = tAData[i].get_coord(); +// res_local_row_accA[row] += tAData[i]; +// } + +// for (int i = 0; i < tBData.length(); ++i) { +// auto [row, col] = tBData[i].get_coord(); +// res_local_col_accB[col] += tBData[i]; +// } + +// for (int i = 0; i < tCData.length(); ++i) { +// auto [row, col] = tCData[i].get_coord(); +// res_local_row_accC[row] += tCData[i]; +// } +// }); // parallel for +// }).wait(); +// } + +// float make_fp32(short x) { +// unsigned int y = x; +// y = y << 16; +// float *res = reinterpret_cast(&y); +// return *res; +// } + +// unsigned short make_bf16(float x) { +// int *res = reinterpret_cast(&x); +// *res = *res >> 16; +// return (unsigned short)*res; +// } + +// void matrix_coord_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, int K) { +// for (int m = 0; m < M; m++) +// for (int k = 0; k < K; k++) { +// short *va = (short *)(A_mem + m * K + k); +// res_local_row_origA[m] += *va; +// } + +// for (int k = 0; k < K; k++) +// for (int n = 0; n < N; n++) { +// short *vb = (short *)(B_mem + k * N + n); +// res_local_col_origB[n] += *vb; +// } + +// for (int m = 0; m < M; m++) +// for (int n = 0; n < N; n++) { +// short *vc = (short *)(C_mem + m * N + n); +// res_local_row_origC[m] += *vc; +// } +// } + +// int main() { +// for (int i = 0; i < MATRIX_M; i++) { +// for (int j = 0; j < MATRIX_K; j++) { +// // Ee create bfloat16 from unsigned short since float-to-bfloat's +// // conversion is not allowed. 
+// A[i][j] = bfloat16::from_bits(make_bf16(1.0f)); +// Aref[i][j] = make_bf16(1.0f); +// } +// } +// for (int i = 0; i < MATRIX_K / 2; i++) { +// for (int j = 0; j < MATRIX_N * 2; j++) { +// B[i][j] = bfloat16::from_bits((make_bf16(2.0f))); +// Bref[i][j] = make_bf16(2.0f); +// } +// } +// for (int i = 0; i < MATRIX_M; i++) { +// for (int j = 0; j < MATRIX_N; j++) { +// C[i][j] = 3.0; +// D[i][j] = 3.0; +// } +// } + +// big_matrix MC((float *)&C); +// big_matrix MD((float *)&D); +// big_matrix MA((bfloat16 *)&A); +// big_matrix MB((bfloat16 *)&B); + +// res_local_rowA = (int32_t *)calloc(MATRIX_M, sizeof(int32_t)); +// res_local_colB = (int32_t *)calloc(MATRIX_N, sizeof(int32_t)); +// res_local_rowC = (int32_t *)calloc(MATRIX_M, sizeof(int32_t)); + +// res_local_row_origA = (int32_t *)calloc(MATRIX_M, sizeof(int32_t)); +// res_local_col_origB = (int32_t *)calloc(MATRIX_N, sizeof(int32_t)); +// res_local_row_origC = (int32_t *)calloc(MATRIX_M, sizeof(int32_t)); + +// matrix_coord(MC, MA, MB); +// matrix_coord_ref((int32_t *)Aref, (int32_t *)Bref, (int32_t *)D, MATRIX_M, +// MATRIX_N, MATRIX_K / 2); + +// bool res = true; +// for (int i = 0; i < MATRIX_M; i++) { +// if (res_local_rowA[i] != res_local_row_origA[i]) +// res = false; +// } +// for (int i = 0; i < MATRIX_K; i++) { +// if (res_local_colB[i] != res_local_col_origB[i]) +// res = false; +// } +// for (int i = 0; i < MATRIX_M; i++) { +// if (res_local_rowC[i] != res_local_row_origC[i]) +// res = false; +// } +// if (res) +// std::cout << "passed\n"; +// else +// std::cout << "failed\n"; +// for (int i = 0; i < MATRIX_M; i++) { +// for (int j = 0; j < MATRIX_N; j++) +// std::cout << C[i][j] << ", "; +// std::cout << "\n"; +// } +// std::cout << std::endl; +// for (int i = 0; i < MATRIX_M; i++) { +// for (int j = 0; j < MATRIX_N; j++) +// std::cout << D[i][j] << ", "; +// std::cout << "\n"; +// } +// } + #include #include using namespace sycl::ext::oneapi::experimental::matrix; using bfloat16 = sycl::ext::oneapi::experimental::bfloat16; -static constexpr auto TILE_SZ = 16; -static constexpr auto TM = TILE_SZ - 1; -static constexpr auto TN = TILE_SZ - 1; -static constexpr auto TK = 2 * TILE_SZ - 2; +#define SG_SZ 16 -static constexpr auto SG_SZ = 16; +#define TM 8 +#define TN SG_SZ +#define TK 16 template struct big_matrix { public: @@ -22,95 +240,87 @@ template struct big_matrix { big_matrix(T *data) : mat(data) {} }; -static constexpr size_t MATRIX_M = TM * 2; -static constexpr size_t MATRIX_N = TN * 2; -static constexpr size_t MATRIX_K = TK * 2; -bfloat16 A[MATRIX_M][MATRIX_K]; -bfloat16 B[MATRIX_K / 2][MATRIX_N * 2]; -unsigned short Aref[MATRIX_M][MATRIX_K]; -unsigned short Bref[MATRIX_K / 2][MATRIX_N * 2]; -float C[MATRIX_M][MATRIX_N]; -float D[MATRIX_M][MATRIX_N]; -int32_t *res_local_rowA; -int32_t *res_local_colB; -int32_t *res_local_rowC; -int32_t *res_local_row_origA; -int32_t *res_local_col_origB; -int32_t *res_local_row_origC; -template -void matrix_coord(big_matrix &C, - big_matrix &A, - big_matrix &B) { - size_t M = NUM_ROWS_C; - size_t N = NUM_COLS_C; - size_t K = NUM_COLS_A; - // B => K/4 x N*4, A => M x K, C => M, N - // stride should be X's cols, e.g., B's stirde = N*4 - assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * 2); - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - sycl::buffer bufA(A.get_data(), sycl::range<2>(M, K)); - sycl::buffer bufB(B.get_data(), sycl::range<2>(K, N)); - sycl::buffer bufC((float *)C.get_data(), sycl::range<2>(M, N)); - - sycl::buffer res_local_row_bufA(res_local_rowA, 
- sycl::range<1>(MATRIX_M)); - sycl::buffer res_local_col_bufB(res_local_colB, - sycl::range<1>(MATRIX_N)); - sycl::buffer res_local_row_bufC(res_local_rowC, - sycl::range<1>(MATRIX_M)); +static constexpr size_t MATRIX_M = 2 * TM; +static constexpr size_t MATRIX_N = 2 * TN; +static constexpr size_t MATRIX_K = 2 * TK; +bfloat16 A_ref[MATRIX_M][MATRIX_K]; +bfloat16 B_ref[MATRIX_K][MATRIX_N]; +float C_ref[MATRIX_M][MATRIX_N]; - sycl::queue q; - q.submit([&](sycl::handler &cgh) { - auto accC = bufC.get_access(cgh); - auto accA = bufA.get_access(cgh); - auto accB = bufB.get_access(cgh); - - auto res_local_row_accA = - res_local_row_bufA.get_access(cgh); - auto res_local_col_accB = - res_local_col_bufB.get_access(cgh); - auto res_local_row_accC = - res_local_row_bufC.get_access(cgh); - - cgh.parallel_for( +bfloat16 *A; +bfloat16 *B; +float* C; + +// float *res_local_rowA; +// float *res_local_colB; +// float *res_local_rowC; + +float *res_local_row_origA; +float *res_local_col_origB; +float *res_local_row_origC; + +void matrix_coord(sycl::queue &q, float* resA, float *resB, float *resC) { + + size_t NDRangeM = MATRIX_M / TM; + size_t NDRangeN = MATRIX_N / TN; + + +auto pA = sycl::multi_ptr(A); +auto pB = sycl::multi_ptr(B); +auto pC = sycl::multi_ptr(C); + + +q.submit([&](sycl::handler &cgh) { + + cgh.parallel_for( sycl::nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [accA, accB, accC, M, N, K, res_local_row_accA, res_local_col_accB, - res_local_row_accC](sycl::nd_item<2> spmd_item) + [=](sycl::nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] + + { + size_t M = MATRIX_M; + size_t N = MATRIX_N; + size_t K = MATRIX_K; - { sycl::ext::oneapi::sub_group sg = spmd_item.get_sub_group(); + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + joint_matrix sub_a(sg); - // For B, since current implementation does not support non-packed - // layout, users need to specify the updated VNNI sizes along with - // the packed_b layout. By default, the layout is row_major and size - // is (TK, TN). joint_matrix sub_b(sg); joint_matrix sub_c(sg); - joint_matrix_fill(sg, sub_a, 1); - joint_matrix_fill(sg, sub_b, 2); - joint_matrix_fill(sg, sub_c, 3); - // Element wise operation - auto tAData = sub_a.get_wi_data(); - auto tBData = sub_b.get_wi_data(); + joint_matrix_load(sg, sub_c, + pC + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, layout::row_major); auto tCData = sub_c.get_wi_data(); - - for (int i = 0; i < tAData.length(); ++i) { - auto [row, col] = tAData[i].get_coord(); - res_local_row_accA[row] += tAData[i]; + for (int i = 0; i < tCData.length(); ++i) { + auto [row, col] = tCData[i].get_coord(); + resC[row] += tCData[i]; } - for (int i = 0; i < tBData.length(); ++i) { - auto [row, col] = tBData[i].get_coord(); - res_local_col_accB[col] += tBData[i]; - } + for (int k = 0; k < K / TK; k += 1) { // + joint_matrix_load( + sg, sub_a, pA + (sg_startx * TM) * K + k * TK, + K, layout::row_major); + auto tAData = sub_a.get_wi_data(); + for (int i = 0; i < tAData.length(); ++i) { + auto [row, col] = tAData[i].get_coord(); + resA[row] += tAData[i]; + } - for (int i = 0; i < tCData.length(); ++i) { - auto [row, col] = tCData[i].get_coord(); - res_local_row_accC[row] += tCData[i]; + // Assuming B data is already in VNNI format. 
+ joint_matrix_load(sg, sub_b, + pB + (k * TK / 2) * (N * 2) + + sg_starty / SG_SZ * TN * 2, + N * 2, layout::packed_b); + auto tBData = sub_b.get_wi_data(); + for (int i = 0; i < tBData.length(); ++i) { + auto [row, col] = tBData[i].get_coord(); + resB[col] += tBData[i]; + } } }); // parallel for }).wait(); @@ -129,66 +339,64 @@ unsigned short make_bf16(float x) { return (unsigned short)*res; } -void matrix_coord_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, int K) { - for (int m = 0; m < M; m++) - for (int k = 0; k < K; k++) { - short *va = (short *)(A_mem + m * K + k); - res_local_row_origA[m] += *va; - } - - for (int k = 0; k < K; k++) - for (int n = 0; n < N; n++) { - short *vb = (short *)(B_mem + k * N + n); - res_local_col_origB[n] += *vb; - } - - for (int m = 0; m < M; m++) - for (int n = 0; n < N; n++) { - short *vc = (short *)(C_mem + m * N + n); - res_local_row_origC[m] += *vc; +// void matrix_coord_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, int K) { +void matrix_coord_ref(int M, int N, int K) { + for (int m = 0; m < M; m+=TM) { + for (int n = 0; n < N; n+=TN) { + for (int k = 0; k < K; k+=TK) { + for (int mm = 0; mm < TM; mm++) { + for (int nn = 0; nn < TN; nn++) { + res_local_row_origC[m*TM + mm]+= C_ref[m*TM+mm][n*TN+nn]; + for (int kk = 0; kk < TK; kk++) { + res_local_row_origA[k*TK + kk]+= A_ref[m*TM+mm][k*TK+kk]; + res_local_col_origB[n*TN + nn]+= B_ref[k*TK+kk][n*TN+nn]; + } + } + } + } } + } } int main() { for (int i = 0; i < MATRIX_M; i++) { for (int j = 0; j < MATRIX_K; j++) { - // Ee create bfloat16 from unsigned short since float-to-bfloat's + // Create bfloat16 from unsigned short since float-to-bfloat's // conversion is not allowed. - A[i][j] = bfloat16::from_bits(make_bf16(1.0f)); - Aref[i][j] = make_bf16(1.0f); + A_ref[i][j] = bfloat16::from_bits(make_bf16(1.0f)); } } - for (int i = 0; i < MATRIX_K / 2; i++) { - for (int j = 0; j < MATRIX_N * 2; j++) { - B[i][j] = bfloat16::from_bits((make_bf16(2.0f))); - Bref[i][j] = make_bf16(2.0f); + for (int i = 0; i < MATRIX_K; i++) { + for (int j = 0; j < MATRIX_N; j++) { + B_ref[i][j] = bfloat16::from_bits((make_bf16(2.0f))); } } for (int i = 0; i < MATRIX_M; i++) { for (int j = 0; j < MATRIX_N; j++) { - C[i][j] = 3.0; - D[i][j] = 3.0; + C_ref[i][j] = 3.0; } } + + sycl::queue q; + A = malloc_shared(MATRIX_M * MATRIX_K, q); + B = malloc_shared(MATRIX_K * MATRIX_N, q); + C = malloc_shared(MATRIX_M * MATRIX_N, q); - big_matrix MC((float *)&C); - big_matrix MD((float *)&D); - big_matrix MA((bfloat16 *)&A); - big_matrix MB((bfloat16 *)&B); - res_local_rowA = (int32_t *)calloc(MATRIX_M, sizeof(int32_t)); - res_local_colB = (int32_t *)calloc(MATRIX_N, sizeof(int32_t)); - res_local_rowC = (int32_t *)calloc(MATRIX_M, sizeof(int32_t)); + float* res_local_rowA = malloc_shared(MATRIX_M, q); + float* res_local_colB = malloc_shared(MATRIX_N, q); + float* res_local_rowC = malloc_shared(MATRIX_M, q); - res_local_row_origA = (int32_t *)calloc(MATRIX_M, sizeof(int32_t)); - res_local_col_origB = (int32_t *)calloc(MATRIX_N, sizeof(int32_t)); - res_local_row_origC = (int32_t *)calloc(MATRIX_M, sizeof(int32_t)); + res_local_row_origA = (float *)calloc(MATRIX_M, sizeof(float)); // globals + res_local_col_origB = (float *)calloc(MATRIX_N, sizeof(float)); + res_local_row_origC = (float *)calloc(MATRIX_M, sizeof(float)); - matrix_coord(MC, MA, MB); - matrix_coord_ref((int32_t *)Aref, (int32_t *)Bref, (int32_t *)D, MATRIX_M, - MATRIX_N, MATRIX_K / 2); + matrix_coord(q, res_local_rowA, res_local_colB, res_local_rowC); + 
matrix_coord_ref(MATRIX_M, + MATRIX_N, MATRIX_K); bool res = true; + for (int i = 0; i < MATRIX_M; i++) { if (res_local_rowA[i] != res_local_row_origA[i]) res = false; @@ -205,15 +413,4 @@ int main() { std::cout << "passed\n"; else std::cout << "failed\n"; - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) - std::cout << C[i][j] << ", "; - std::cout << "\n"; - } - std::cout << std::endl; - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) - std::cout << D[i][j] << ", "; - std::cout << "\n"; - } } diff --git a/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp b/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp index 4da7bafb29dc3..4bd86f36c7e2f 100644 --- a/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp +++ b/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp @@ -100,11 +100,7 @@ void matrix_multiply(big_matrix &C, sg_starty / SG_SZ * TN * 2, N * 2, layout::packed_b); sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); - } - joint_matrix_store(sg, sub_c, - accC.get_pointer() + (sg_startx * TM) * N + - sg_starty / SG_SZ * TN, - N, layout::row_major); + } // Element wise operation auto tCData = sub_c.get_wi_data(); From 1c5ace58e452bfb6e1fc3cb2241330f10a763bd4 Mon Sep 17 00:00:00 2001 From: "arnamoy.bhattacharyya" Date: Mon, 24 Oct 2022 12:50:12 -0400 Subject: [PATCH 08/13] clang-format --- .../matrix-bfloat16-test-coord-basic.cpp | 89 ++++++++++--------- .../matrix-bfloat16-test-coord-gemm.cpp | 2 +- 2 files changed, 46 insertions(+), 45 deletions(-) diff --git a/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp b/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp index c396cb7a92833..ca5d4bdb8cc41 100644 --- a/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp +++ b/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp @@ -85,7 +85,8 @@ // joint_matrix sub_a(sg); // // For B, since current implementation does not support non-packed // // layout, users need to specify the updated VNNI sizes along with -// // the packed_b layout. By default, the layout is row_major and size +// // the packed_b layout. By default, the layout is row_major and +// size // // is (TK, TN). 
// joint_matrix sub_b(sg); // joint_matrix sub_c(sg); @@ -129,7 +130,8 @@ // return (unsigned short)*res; // } -// void matrix_coord_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, int K) { +// void matrix_coord_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, int +// K) { // for (int m = 0; m < M; m++) // for (int k = 0; k < K; k++) { // short *va = (short *)(A_mem + m * K + k); @@ -240,16 +242,16 @@ template struct big_matrix { big_matrix(T *data) : mat(data) {} }; -static constexpr size_t MATRIX_M = 2 * TM; +static constexpr size_t MATRIX_M = 2 * TM; static constexpr size_t MATRIX_N = 2 * TN; -static constexpr size_t MATRIX_K = 2 * TK; +static constexpr size_t MATRIX_K = 2 * TK; bfloat16 A_ref[MATRIX_M][MATRIX_K]; bfloat16 B_ref[MATRIX_K][MATRIX_N]; float C_ref[MATRIX_M][MATRIX_N]; bfloat16 *A; bfloat16 *B; -float* C; +float *C; // float *res_local_rowA; // float *res_local_colB; @@ -259,27 +261,27 @@ float *res_local_row_origA; float *res_local_col_origB; float *res_local_row_origC; -void matrix_coord(sycl::queue &q, float* resA, float *resB, float *resC) { +void matrix_coord(sycl::queue &q, float *resA, float *resB, float *resC) { size_t NDRangeM = MATRIX_M / TM; size_t NDRangeN = MATRIX_N / TN; + auto pA = + sycl::multi_ptr(A); + auto pB = + sycl::multi_ptr(B); + auto pC = + sycl::multi_ptr(C); -auto pA = sycl::multi_ptr(A); -auto pB = sycl::multi_ptr(B); -auto pC = sycl::multi_ptr(C); + q.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), [= + ](sycl::nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] - -q.submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [=](sycl::nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] - - { + { size_t M = MATRIX_M; size_t N = MATRIX_N; - size_t K = MATRIX_K; + size_t K = MATRIX_K; sycl::ext::oneapi::sub_group sg = spmd_item.get_sub_group(); const auto global_idx = spmd_item.get_global_id(0); @@ -292,24 +294,22 @@ q.submit([&](sycl::handler &cgh) { joint_matrix sub_c(sg); joint_matrix_load(sg, sub_c, - pC + (sg_startx * TM) * N + - sg_starty / SG_SZ * TN, + pC + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, N, layout::row_major); auto tCData = sub_c.get_wi_data(); - for (int i = 0; i < tCData.length(); ++i) { + for (int i = 0; i < tCData.length(); ++i) { auto [row, col] = tCData[i].get_coord(); resC[row] += tCData[i]; } for (int k = 0; k < K / TK; k += 1) { // - joint_matrix_load( - sg, sub_a, pA + (sg_startx * TM) * K + k * TK, - K, layout::row_major); + joint_matrix_load(sg, sub_a, pA + (sg_startx * TM) * K + k * TK, K, + layout::row_major); auto tAData = sub_a.get_wi_data(); for (int i = 0; i < tAData.length(); ++i) { - auto [row, col] = tAData[i].get_coord(); - resA[row] += tAData[i]; - } + auto [row, col] = tAData[i].get_coord(); + resA[row] += tAData[i]; + } // Assuming B data is already in VNNI format. 
joint_matrix_load(sg, sub_b, @@ -318,8 +318,8 @@ q.submit([&](sycl::handler &cgh) { N * 2, layout::packed_b); auto tBData = sub_b.get_wi_data(); for (int i = 0; i < tBData.length(); ++i) { - auto [row, col] = tBData[i].get_coord(); - resB[col] += tBData[i]; + auto [row, col] = tBData[i].get_coord(); + resB[col] += tBData[i]; } } }); // parallel for @@ -339,17 +339,20 @@ unsigned short make_bf16(float x) { return (unsigned short)*res; } -// void matrix_coord_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, int K) { +// void matrix_coord_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, int +// K) { void matrix_coord_ref(int M, int N, int K) { - for (int m = 0; m < M; m+=TM) { - for (int n = 0; n < N; n+=TN) { - for (int k = 0; k < K; k+=TK) { + for (int m = 0; m < M; m += TM) { + for (int n = 0; n < N; n += TN) { + for (int k = 0; k < K; k += TK) { for (int mm = 0; mm < TM; mm++) { for (int nn = 0; nn < TN; nn++) { - res_local_row_origC[m*TM + mm]+= C_ref[m*TM+mm][n*TN+nn]; + res_local_row_origC[m * TM + mm] += C_ref[m * TM + mm][n * TN + nn]; for (int kk = 0; kk < TK; kk++) { - res_local_row_origA[k*TK + kk]+= A_ref[m*TM+mm][k*TK+kk]; - res_local_col_origB[n*TN + nn]+= B_ref[k*TK+kk][n*TN+nn]; + res_local_row_origA[k * TK + kk] += + A_ref[m * TM + mm][k * TK + kk]; + res_local_col_origB[n * TN + nn] += + B_ref[k * TK + kk][n * TN + nn]; } } } @@ -376,27 +379,25 @@ int main() { C_ref[i][j] = 3.0; } } - + sycl::queue q; A = malloc_shared(MATRIX_M * MATRIX_K, q); B = malloc_shared(MATRIX_K * MATRIX_N, q); C = malloc_shared(MATRIX_M * MATRIX_N, q); - - float* res_local_rowA = malloc_shared(MATRIX_M, q); - float* res_local_colB = malloc_shared(MATRIX_N, q); - float* res_local_rowC = malloc_shared(MATRIX_M, q); + float *res_local_rowA = malloc_shared(MATRIX_M, q); + float *res_local_colB = malloc_shared(MATRIX_N, q); + float *res_local_rowC = malloc_shared(MATRIX_M, q); res_local_row_origA = (float *)calloc(MATRIX_M, sizeof(float)); // globals res_local_col_origB = (float *)calloc(MATRIX_N, sizeof(float)); res_local_row_origC = (float *)calloc(MATRIX_M, sizeof(float)); matrix_coord(q, res_local_rowA, res_local_colB, res_local_rowC); - matrix_coord_ref(MATRIX_M, - MATRIX_N, MATRIX_K); + matrix_coord_ref(MATRIX_M, MATRIX_N, MATRIX_K); bool res = true; - + for (int i = 0; i < MATRIX_M; i++) { if (res_local_rowA[i] != res_local_row_origA[i]) res = false; diff --git a/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp b/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp index 4bd86f36c7e2f..f2cbce92b126f 100644 --- a/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp +++ b/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp @@ -100,7 +100,7 @@ void matrix_multiply(big_matrix &C, sg_starty / SG_SZ * TN * 2, N * 2, layout::packed_b); sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); - } + } // Element wise operation auto tCData = sub_c.get_wi_data(); From 4529e667c2dce109a471e7471cb99615ade49bbd Mon Sep 17 00:00:00 2001 From: "arnamoy.bhattacharyya" Date: Fri, 4 Nov 2022 11:01:34 -0400 Subject: [PATCH 09/13] Reviewer comments --- sycl/include/CL/__spirv/spirv_ops.hpp | 4 +- .../sycl/ext/oneapi/matrix/matrix-jit-use.hpp | 22 +- .../matrix-bfloat16-test-coord-basic.cpp | 442 +++--------------- .../matrix-bfloat16-test-coord-gemm.cpp | 17 +- 4 files changed, 98 insertions(+), 387 deletions(-) diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index a7fe392fc0b06..d07e6ec7245a2 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ 
b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -127,8 +127,8 @@ template -extern SYCL_EXTERNAL __ocl_vec_t -__spirv_JointMatrixWorkItemElemCoord(JOINT_MATRIX_INTEL(T, R, C, L, S, U) *, +extern SYCL_EXTERNAL __ocl_vec_t +__spirv_JointMatrixGetElementCoordINTEL(JOINT_MATRIX_INTEL(T, R, C, L, S, U) *, size_t i); template get_coord() { #ifdef __SYCL_DEVICE_ONLY__ - __ocl_vec_t co_ord = + __ocl_vec_t coord = __spirv_JointMatrixWorkItemElemCoord(M.spvm, idx); - const int32_t row = co_ord[0]; - const int32_t col = co_ord[1]; + const int32_t row = coord[0]; + const int32_t col = coord[1]; return std::make_tuple(row, col); #else throw runtime_error("joint matrix is not supported on host device.", @@ -356,12 +356,12 @@ class wi_element { std::size_t i) : M(Mat), idx(i) {} - std::tuple get_coord() { + std::tuple get_coord() { #ifdef __SYCL_DEVICE_ONLY__ - __ocl_vec_t coord = - __spirv_JointMatrixWorkItemElemCoord(M.spvm, idx); - const int32_t row = coord[0]; - const int32_t col = coord[1]; + __ocl_vec_t coord = + __spirv_JointMatrixGetElementCoordINTEL(M.spvm, idx); + const uint32_t row = coord[0]; + const uint32_t col = coord[1]; return std::make_tuple(row, col); #else throw runtime_error("joint matrix is not supported on host device.", @@ -522,10 +522,10 @@ class wi_element get_coord() { #ifdef __SYCL_DEVICE_ONLY__ - __ocl_vec_t co_ord = + __ocl_vec_t coord = __spirv_JointMatrixWorkItemElemCoord(M.spvm, idx); - const int32_t row = co_ord[0]; - const int32_t col = co_ord[1]; + const int32_t row = coord[0]; + const int32_t col = coord[1]; return std::make_tuple(row, col); #else throw runtime_error("joint matrix is not supported on host device.", diff --git a/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp b/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp index ca5d4bdb8cc41..e668862c9cea8 100644 --- a/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp +++ b/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp @@ -1,236 +1,20 @@ // RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=2 %s -o %t.out -// #include -// #include -// using namespace sycl::ext::oneapi::experimental::matrix; -// using bfloat16 = sycl::ext::oneapi::experimental::bfloat16; - -// static constexpr auto TILE_SZ = 16; -// static constexpr auto TM = TILE_SZ - 1; -// static constexpr auto TN = TILE_SZ - 1; -// static constexpr auto TK = 2 * TILE_SZ - 2; - -// static constexpr auto SG_SZ = 16; - -// template struct big_matrix { -// public: -// T *mat; - -// public: -// T *get_data() { return mat; } -// void set_data(T *data) { mat = data; } -// big_matrix(T *data) : mat(data) {} -// }; - -// static constexpr size_t MATRIX_M = TM * 2; -// static constexpr size_t MATRIX_N = TN * 2; -// static constexpr size_t MATRIX_K = TK * 2; -// bfloat16 A[MATRIX_M][MATRIX_K]; -// bfloat16 B[MATRIX_K / 2][MATRIX_N * 2]; -// unsigned short Aref[MATRIX_M][MATRIX_K]; -// unsigned short Bref[MATRIX_K / 2][MATRIX_N * 2]; -// float C[MATRIX_M][MATRIX_N]; -// float D[MATRIX_M][MATRIX_N]; -// int32_t *res_local_rowA; -// int32_t *res_local_colB; -// int32_t *res_local_rowC; -// int32_t *res_local_row_origA; -// int32_t *res_local_col_origB; -// int32_t *res_local_row_origC; -// template -// void matrix_coord(big_matrix &C, -// big_matrix &A, -// big_matrix &B) { -// size_t M = NUM_ROWS_C; -// size_t N = NUM_COLS_C; -// size_t K = NUM_COLS_A; -// // B => K/4 x N*4, A => M x K, C => M, N -// // stride should be X's cols, e.g., B's stirde = N*4 -// assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * 2); -// size_t NDRangeM = M / TM; 
-// size_t NDRangeN = N / TN; -// sycl::buffer bufA(A.get_data(), sycl::range<2>(M, K)); -// sycl::buffer bufB(B.get_data(), sycl::range<2>(K, N)); -// sycl::buffer bufC((float *)C.get_data(), sycl::range<2>(M, N)); - -// sycl::buffer res_local_row_bufA(res_local_rowA, -// sycl::range<1>(MATRIX_M)); -// sycl::buffer res_local_col_bufB(res_local_colB, -// sycl::range<1>(MATRIX_N)); -// sycl::buffer res_local_row_bufC(res_local_rowC, -// sycl::range<1>(MATRIX_M)); - -// sycl::queue q; -// q.submit([&](sycl::handler &cgh) { -// auto accC = bufC.get_access(cgh); -// auto accA = bufA.get_access(cgh); -// auto accB = bufB.get_access(cgh); - -// auto res_local_row_accA = -// res_local_row_bufA.get_access(cgh); -// auto res_local_col_accB = -// res_local_col_bufB.get_access(cgh); -// auto res_local_row_accC = -// res_local_row_bufC.get_access(cgh); - -// cgh.parallel_for( -// sycl::nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), -// [accA, accB, accC, M, N, K, res_local_row_accA, res_local_col_accB, -// res_local_row_accC](sycl::nd_item<2> spmd_item) - -// { -// sycl::ext::oneapi::sub_group sg = spmd_item.get_sub_group(); -// joint_matrix sub_a(sg); -// // For B, since current implementation does not support non-packed -// // layout, users need to specify the updated VNNI sizes along with -// // the packed_b layout. By default, the layout is row_major and -// size -// // is (TK, TN). -// joint_matrix sub_b(sg); -// joint_matrix sub_c(sg); - -// joint_matrix_fill(sg, sub_a, 1); -// joint_matrix_fill(sg, sub_b, 2); -// joint_matrix_fill(sg, sub_c, 3); -// // Element wise operation -// auto tAData = sub_a.get_wi_data(); -// auto tBData = sub_b.get_wi_data(); -// auto tCData = sub_c.get_wi_data(); - -// for (int i = 0; i < tAData.length(); ++i) { -// auto [row, col] = tAData[i].get_coord(); -// res_local_row_accA[row] += tAData[i]; -// } - -// for (int i = 0; i < tBData.length(); ++i) { -// auto [row, col] = tBData[i].get_coord(); -// res_local_col_accB[col] += tBData[i]; -// } - -// for (int i = 0; i < tCData.length(); ++i) { -// auto [row, col] = tCData[i].get_coord(); -// res_local_row_accC[row] += tCData[i]; -// } -// }); // parallel for -// }).wait(); -// } - -// float make_fp32(short x) { -// unsigned int y = x; -// y = y << 16; -// float *res = reinterpret_cast(&y); -// return *res; -// } - -// unsigned short make_bf16(float x) { -// int *res = reinterpret_cast(&x); -// *res = *res >> 16; -// return (unsigned short)*res; -// } - -// void matrix_coord_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, int -// K) { -// for (int m = 0; m < M; m++) -// for (int k = 0; k < K; k++) { -// short *va = (short *)(A_mem + m * K + k); -// res_local_row_origA[m] += *va; -// } - -// for (int k = 0; k < K; k++) -// for (int n = 0; n < N; n++) { -// short *vb = (short *)(B_mem + k * N + n); -// res_local_col_origB[n] += *vb; -// } - -// for (int m = 0; m < M; m++) -// for (int n = 0; n < N; n++) { -// short *vc = (short *)(C_mem + m * N + n); -// res_local_row_origC[m] += *vc; -// } -// } - -// int main() { -// for (int i = 0; i < MATRIX_M; i++) { -// for (int j = 0; j < MATRIX_K; j++) { -// // Ee create bfloat16 from unsigned short since float-to-bfloat's -// // conversion is not allowed. 
-// A[i][j] = bfloat16::from_bits(make_bf16(1.0f)); -// Aref[i][j] = make_bf16(1.0f); -// } -// } -// for (int i = 0; i < MATRIX_K / 2; i++) { -// for (int j = 0; j < MATRIX_N * 2; j++) { -// B[i][j] = bfloat16::from_bits((make_bf16(2.0f))); -// Bref[i][j] = make_bf16(2.0f); -// } -// } -// for (int i = 0; i < MATRIX_M; i++) { -// for (int j = 0; j < MATRIX_N; j++) { -// C[i][j] = 3.0; -// D[i][j] = 3.0; -// } -// } - -// big_matrix MC((float *)&C); -// big_matrix MD((float *)&D); -// big_matrix MA((bfloat16 *)&A); -// big_matrix MB((bfloat16 *)&B); - -// res_local_rowA = (int32_t *)calloc(MATRIX_M, sizeof(int32_t)); -// res_local_colB = (int32_t *)calloc(MATRIX_N, sizeof(int32_t)); -// res_local_rowC = (int32_t *)calloc(MATRIX_M, sizeof(int32_t)); - -// res_local_row_origA = (int32_t *)calloc(MATRIX_M, sizeof(int32_t)); -// res_local_col_origB = (int32_t *)calloc(MATRIX_N, sizeof(int32_t)); -// res_local_row_origC = (int32_t *)calloc(MATRIX_M, sizeof(int32_t)); - -// matrix_coord(MC, MA, MB); -// matrix_coord_ref((int32_t *)Aref, (int32_t *)Bref, (int32_t *)D, MATRIX_M, -// MATRIX_N, MATRIX_K / 2); - -// bool res = true; -// for (int i = 0; i < MATRIX_M; i++) { -// if (res_local_rowA[i] != res_local_row_origA[i]) -// res = false; -// } -// for (int i = 0; i < MATRIX_K; i++) { -// if (res_local_colB[i] != res_local_col_origB[i]) -// res = false; -// } -// for (int i = 0; i < MATRIX_M; i++) { -// if (res_local_rowC[i] != res_local_row_origC[i]) -// res = false; -// } -// if (res) -// std::cout << "passed\n"; -// else -// std::cout << "failed\n"; -// for (int i = 0; i < MATRIX_M; i++) { -// for (int j = 0; j < MATRIX_N; j++) -// std::cout << C[i][j] << ", "; -// std::cout << "\n"; -// } -// std::cout << std::endl; -// for (int i = 0; i < MATRIX_M; i++) { -// for (int j = 0; j < MATRIX_N; j++) -// std::cout << D[i][j] << ", "; -// std::cout << "\n"; -// } -// } +// this code calculates the sum of rows into a global array of number of rows +// elements. First, partial reduction is computed inside each SG, then atomic +// add is used to reduce between SG leaders. 
The get_coord() API is used for +// retrieving the row #include #include +using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -using bfloat16 = sycl::ext::oneapi::experimental::bfloat16; #define SG_SZ 16 -#define TM 8 #define TN SG_SZ -#define TK 16 +#define TK 32 template struct big_matrix { public: @@ -242,176 +26,94 @@ template struct big_matrix { big_matrix(T *data) : mat(data) {} }; -static constexpr size_t MATRIX_M = 2 * TM; -static constexpr size_t MATRIX_N = 2 * TN; -static constexpr size_t MATRIX_K = 2 * TK; -bfloat16 A_ref[MATRIX_M][MATRIX_K]; -bfloat16 B_ref[MATRIX_K][MATRIX_N]; -float C_ref[MATRIX_M][MATRIX_N]; - -bfloat16 *A; -bfloat16 *B; -float *C; - -// float *res_local_rowA; -// float *res_local_colB; -// float *res_local_rowC; - -float *res_local_row_origA; -float *res_local_col_origB; -float *res_local_row_origC; - -void matrix_coord(sycl::queue &q, float *resA, float *resB, float *resC) { - - size_t NDRangeM = MATRIX_M / TM; - size_t NDRangeN = MATRIX_N / TN; - - auto pA = - sycl::multi_ptr(A); - auto pB = - sycl::multi_ptr(B); - auto pC = - sycl::multi_ptr(C); +template +void sum_rows_ref( + accessor B, + accessor + sum_rows) { + int sum_rows_ref[M] = {0}; + for (size_t i = 0; i < M; i++) { + for (size_t j = 0; j < N; j++) { + sum_rows_ref[i] += B[i][j]; + } + auto diff = sum_rows[i] - sum_rows_ref[i]; + assert(std::fabs(static_cast(diff)) <= + std::numeric_limits::epsilon()); + } +} - q.submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), [= - ](sycl::nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] +template +void matrix_sum_rows(queue q, big_matrix &B, nd_range<2> &r) { + buffer bufB(B.get_data(), range<2>(M, N)); + // size of vector is known because SG size of set by the user in this case + int sum_rows[M] = {0}; + buffer sum_rows_v(sum_rows, M); // there are total of tK/4 * 2, 16 rows + q.submit([&](handler &cgh) { + auto accB = bufB.get_access(cgh); - { - size_t M = MATRIX_M; - size_t N = MATRIX_N; - size_t K = MATRIX_K; + auto v = sum_rows_v.get_access(cgh); - sycl::ext::oneapi::sub_group sg = spmd_item.get_sub_group(); + cgh.parallel_for( + r, [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { const auto global_idx = spmd_item.get_global_id(0); const auto global_idy = spmd_item.get_global_id(1); const auto sg_startx = global_idx - spmd_item.get_local_id(0); const auto sg_starty = global_idy - spmd_item.get_local_id(1); - joint_matrix sub_a(sg); - joint_matrix sub_b(sg); - joint_matrix sub_c(sg); + ext::oneapi::sub_group sg = spmd_item.get_sub_group(); - joint_matrix_load(sg, sub_c, - pC + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, - N, layout::row_major); - auto tCData = sub_c.get_wi_data(); - for (int i = 0; i < tCData.length(); ++i) { - auto [row, col] = tCData[i].get_coord(); - resC[row] += tCData[i]; - } + joint_matrix sub_b(sg); - for (int k = 0; k < K / TK; k += 1) { // - joint_matrix_load(sg, sub_a, pA + (sg_startx * TM) * K + k * TK, K, - layout::row_major); - auto tAData = sub_a.get_wi_data(); - for (int i = 0; i < tAData.length(); ++i) { - auto [row, col] = tAData[i].get_coord(); - resA[row] += tAData[i]; - } + joint_matrix_load(sg, sub_b, + accB.get_pointer() + (global_idx * (TK / 4) * N) + + sg_starty / SG_SZ * TN * 4, + N, layout::packed_b); - // Assuming B data is already in VNNI format. 
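// The kernel above accumulates each work-item's per-row partial sums, reduces
// them across the subgroup, and lets a single work item per subgroup perform
// the global update. A sketch of an equivalent final update written with the
// SYCL 2020 atomic_ref interface is shown below; it assumes the same accessor
// v, subgroup sg, and sum_local_rows array as in the test and is illustrative
// only, not part of the patch.
if (sg.get_local_id()[0] == 0) {
  // Same work item as the global_idy % SG_SZ == 0 check above, given the
  // {1, SG_SZ} local range.
  sycl::atomic_ref<int, sycl::memory_order::relaxed,
                   sycl::memory_scope::device,
                   sycl::access::address_space::global_space>
      row_sum(v[row]);
  row_sum.fetch_add(sum_local_rows[row]);
}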
- joint_matrix_load(sg, sub_b, - pB + (k * TK / 2) * (N * 2) + - sg_starty / SG_SZ * TN * 2, - N * 2, layout::packed_b); - auto tBData = sub_b.get_wi_data(); - for (int i = 0; i < tBData.length(); ++i) { + + + int32_t sum_local_rows[M] = {0}; + auto tBData = sub_b.get_wi_data(); + + // each WI calculates local sum of rows + for (int i = 0; i < tBData.length(); ++i) { + // row and col holds global co_ordinates of the matrix auto [row, col] = tBData[i].get_coord(); - resB[col] += tBData[i]; - } - } + sum_local_rows[row] += tBData[i]; + + sum_local_rows[row] = reduce_over_group( + sg, sum_local_rows[row], + sycl::plus<>()); + // only Groups leader perform the global reduction + if (global_idy % SG_SZ == 0) { + atomic_fetch_add(v[row], + sum_local_rows[row]); + } + } }); // parallel for }).wait(); + sum_rows_ref(bufB.get_access(), + sum_rows_v.get_access()); } -float make_fp32(short x) { - unsigned int y = x; - y = y << 16; - float *res = reinterpret_cast(&y); - return *res; -} +static constexpr size_t MATRIX_K = TK / 4 * 2; +static constexpr size_t MATRIX_N = TN * 4 * 2; +int8_t B[MATRIX_K][MATRIX_N]; -unsigned short make_bf16(float x) { - int *res = reinterpret_cast(&x); - *res = *res >> 16; - return (unsigned short)*res; -} +int main() { + big_matrix MB((int8_t *)&B); -// void matrix_coord_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, int -// K) { -void matrix_coord_ref(int M, int N, int K) { - for (int m = 0; m < M; m += TM) { - for (int n = 0; n < N; n += TN) { - for (int k = 0; k < K; k += TK) { - for (int mm = 0; mm < TM; mm++) { - for (int nn = 0; nn < TN; nn++) { - res_local_row_origC[m * TM + mm] += C_ref[m * TM + mm][n * TN + nn]; - for (int kk = 0; kk < TK; kk++) { - res_local_row_origA[k * TK + kk] += - A_ref[m * TM + mm][k * TK + kk]; - res_local_col_origB[n * TN + nn] += - B_ref[k * TK + kk][n * TN + nn]; - } - } - } - } - } - } -} + size_t NDRangeK = MATRIX_K / (TK / 4); + size_t NDRangeN = (MATRIX_N / 4) / TN; + queue q; + nd_range<2> r({NDRangeK, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}); -int main() { - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_K; j++) { - // Create bfloat16 from unsigned short since float-to-bfloat's - // conversion is not allowed. 
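// The B operand above is declared directly in its packed (VNNI) shape:
// MATRIX_K = TK / 4 * 2 packed rows by MATRIX_N = TN * 4 * 2 packed columns,
// i.e. consecutive K-rows of the logical int8 matrix are interleaved into one
// packed row. A host-side sketch of that index mapping, assuming the usual
// 4-element interleave for 8-bit types (illustrative only, not part of the
// patch):
inline int8_t &vnni_element(int8_t *packed, size_t packed_cols, size_t k,
                            size_t n) {
  // logical element (k, n) lives at packed[k / 4][n * 4 + k % 4]
  return packed[(k / 4) * packed_cols + n * 4 + k % 4];
}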
- A_ref[i][j] = bfloat16::from_bits(make_bf16(1.0f)); - } - } for (int i = 0; i < MATRIX_K; i++) { for (int j = 0; j < MATRIX_N; j++) { - B_ref[i][j] = bfloat16::from_bits((make_bf16(2.0f))); + B[i][j] = i; } } - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) { - C_ref[i][j] = 3.0; - } - } - - sycl::queue q; - A = malloc_shared(MATRIX_M * MATRIX_K, q); - B = malloc_shared(MATRIX_K * MATRIX_N, q); - C = malloc_shared(MATRIX_M * MATRIX_N, q); - - float *res_local_rowA = malloc_shared(MATRIX_M, q); - float *res_local_colB = malloc_shared(MATRIX_N, q); - float *res_local_rowC = malloc_shared(MATRIX_M, q); - res_local_row_origA = (float *)calloc(MATRIX_M, sizeof(float)); // globals - res_local_col_origB = (float *)calloc(MATRIX_N, sizeof(float)); - res_local_row_origC = (float *)calloc(MATRIX_M, sizeof(float)); + matrix_sum_rows(q, MB, r); - matrix_coord(q, res_local_rowA, res_local_colB, res_local_rowC); - matrix_coord_ref(MATRIX_M, MATRIX_N, MATRIX_K); - - bool res = true; - - for (int i = 0; i < MATRIX_M; i++) { - if (res_local_rowA[i] != res_local_row_origA[i]) - res = false; - } - for (int i = 0; i < MATRIX_K; i++) { - if (res_local_colB[i] != res_local_col_origB[i]) - res = false; - } - for (int i = 0; i < MATRIX_M; i++) { - if (res_local_rowC[i] != res_local_row_origC[i]) - res = false; - } - if (res) - std::cout << "passed\n"; - else - std::cout << "failed\n"; -} + return 0; +} \ No newline at end of file diff --git a/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp b/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp index f2cbce92b126f..e19c2b657d8d4 100644 --- a/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp +++ b/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp @@ -60,13 +60,12 @@ void matrix_multiply(big_matrix &C, auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); - auto res_local_row_acc = - res_local_row_buf.get_access(cgh); + auto v = res_local_row_buf.get_access(cgh); cgh.parallel_for( sycl::nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), [accA, accB, accC, M, N, K, - res_local_row_acc](sycl::nd_item<2> spmd_item) + v](sycl::nd_item<2> spmd_item) { // The submatrix API has to be accessed by all the workitems in a @@ -103,10 +102,20 @@ void matrix_multiply(big_matrix &C, } // Element wise operation auto tCData = sub_c.get_wi_data(); + int32_t sum_local_rows[MATRIX_M] = {0}; for (int i = 0; i < tCData.length(); ++i) { auto [row, col] = tCData[i].get_coord(); - res_local_row_acc[row] += tCData[i]; + sum_local_rows[row] += tCData[i]; + + sum_local_rows[row] = sycl::reduce_over_group( + sg, sum_local_rows[row], + sycl::plus<>()); + // only Groups leader perform the global reduction + if (global_idy % SG_SZ == 0) { + atomic_fetch_add(v[row], + sum_local_rows[row]); + } } }); // parallel for }).wait(); From 643aafc1254cf1816c16c339870c1ade034ba22c Mon Sep 17 00:00:00 2001 From: "arnamoy.bhattacharyya" Date: Fri, 4 Nov 2022 11:03:03 -0400 Subject: [PATCH 10/13] Clang-format --- sycl/include/CL/__spirv/spirv_ops.hpp | 2 +- .../matrix-bfloat16-test-coord-basic.cpp | 28 ++++++++----------- .../matrix-bfloat16-test-coord-gemm.cpp | 15 ++++------ 3 files changed, 19 insertions(+), 26 deletions(-) diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index d07e6ec7245a2..e12de1eebc4e8 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -129,7 +129,7 @@ template extern SYCL_EXTERNAL __ocl_vec_t 
__spirv_JointMatrixGetElementCoordINTEL(JOINT_MATRIX_INTEL(T, R, C, L, S, U) *, - size_t i); + size_t i); template &B, nd_range<2> &r) { sg_starty / SG_SZ * TN * 4, N, layout::packed_b); - - - int32_t sum_local_rows[M] = {0}; + int32_t sum_local_rows[M] = {0}; auto tBData = sub_b.get_wi_data(); // each WI calculates local sum of rows for (int i = 0; i < tBData.length(); ++i) { - // row and col holds global co_ordinates of the matrix - auto [row, col] = tBData[i].get_coord(); - sum_local_rows[row] += tBData[i]; - - sum_local_rows[row] = reduce_over_group( - sg, sum_local_rows[row], - sycl::plus<>()); - // only Groups leader perform the global reduction - if (global_idy % SG_SZ == 0) { - atomic_fetch_add(v[row], - sum_local_rows[row]); - } - } + // row and col holds global co_ordinates of the matrix + auto [row, col] = tBData[i].get_coord(); + sum_local_rows[row] += tBData[i]; + + sum_local_rows[row] = + reduce_over_group(sg, sum_local_rows[row], sycl::plus<>()); + // only Groups leader perform the global reduction + if (global_idy % SG_SZ == 0) { + atomic_fetch_add(v[row], sum_local_rows[row]); + } + } }); // parallel for }).wait(); sum_rows_ref(bufB.get_access(), diff --git a/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp b/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp index e19c2b657d8d4..c2a176e738e3b 100644 --- a/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp +++ b/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp @@ -64,8 +64,7 @@ void matrix_multiply(big_matrix &C, cgh.parallel_for( sycl::nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [accA, accB, accC, M, N, K, - v](sycl::nd_item<2> spmd_item) + [accA, accB, accC, M, N, K, v](sycl::nd_item<2> spmd_item) { // The submatrix API has to be accessed by all the workitems in a @@ -109,13 +108,11 @@ void matrix_multiply(big_matrix &C, sum_local_rows[row] += tCData[i]; sum_local_rows[row] = sycl::reduce_over_group( - sg, sum_local_rows[row], - sycl::plus<>()); - // only Groups leader perform the global reduction - if (global_idy % SG_SZ == 0) { - atomic_fetch_add(v[row], - sum_local_rows[row]); - } + sg, sum_local_rows[row], sycl::plus<>()); + // only Groups leader perform the global reduction + if (global_idy % SG_SZ == 0) { + atomic_fetch_add(v[row], sum_local_rows[row]); + } } }); // parallel for }).wait(); From 8e73c9d5339f8e98ed8c6efb16a74664429b9a29 Mon Sep 17 00:00:00 2001 From: "arnamoy.bhattacharyya" Date: Fri, 4 Nov 2022 11:09:17 -0400 Subject: [PATCH 11/13] More comments addressed. 
--- .../sycl/ext/oneapi/matrix/matrix-jit-use.hpp | 16 ++++++++-------- .../matrix/matrix-bfloat16-test-coord-gemm.cpp | 4 ---- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp index 1364017a4e0f8..5c43678fa7506 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp @@ -258,12 +258,12 @@ class wi_element { std::size_t i) : M(Mat), idx(i) {} - std::tuple get_coord() { + std::tuple get_coord() { #ifdef __SYCL_DEVICE_ONLY__ - __ocl_vec_t coord = + __ocl_vec_t coord = __spirv_JointMatrixWorkItemElemCoord(M.spvm, idx); - const int32_t row = coord[0]; - const int32_t col = coord[1]; + const uint32_t row = coord[0]; + const uint32_t col = coord[1]; return std::make_tuple(row, col); #else throw runtime_error("joint matrix is not supported on host device.", @@ -520,12 +520,12 @@ class wi_element get_coord() { + std::tuple get_coord() { #ifdef __SYCL_DEVICE_ONLY__ - __ocl_vec_t coord = + __ocl_vec_t coord = __spirv_JointMatrixWorkItemElemCoord(M.spvm, idx); - const int32_t row = coord[0]; - const int32_t col = coord[1]; + const uint32_t row = coord[0]; + const uint32_t col = coord[1]; return std::make_tuple(row, col); #else throw runtime_error("joint matrix is not supported on host device.", diff --git a/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp b/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp index c2a176e738e3b..ced2a5bd5f146 100644 --- a/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp +++ b/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp @@ -77,10 +77,6 @@ void matrix_multiply(big_matrix &C, sycl::ext::oneapi::sub_group sg = spmd_item.get_sub_group(); joint_matrix sub_a(sg); - // For B, since current implementation does not support non-packed - // layout, users need to specify the updated VNNI sizes along with - // the packed_b layout. By default, the layout is row_major and size - // is (TK, TN). joint_matrix sub_b(sg); joint_matrix sub_c(sg); From b2ca8e46d435e5789e0920629ee22358ee114dd1 Mon Sep 17 00:00:00 2001 From: "arnamoy.bhattacharyya" Date: Fri, 4 Nov 2022 11:12:55 -0400 Subject: [PATCH 12/13] Fixing small error --- sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp index 5c43678fa7506..e7a5b6dcfcf53 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp @@ -261,7 +261,7 @@ class wi_element { std::tuple get_coord() { #ifdef __SYCL_DEVICE_ONLY__ __ocl_vec_t coord = - __spirv_JointMatrixWorkItemElemCoord(M.spvm, idx); + __spirv_JointMatrixGetElementCoordINTEL(M.spvm, idx); const uint32_t row = coord[0]; const uint32_t col = coord[1]; return std::make_tuple(row, col); @@ -523,7 +523,7 @@ class wi_element get_coord() { #ifdef __SYCL_DEVICE_ONLY__ __ocl_vec_t coord = - __spirv_JointMatrixWorkItemElemCoord(M.spvm, idx); + __spirv_JointMatrixGetElementCoordINTEL(M.spvm, idx); const uint32_t row = coord[0]; const uint32_t col = coord[1]; return std::make_tuple(row, col); From d2dfda6e7d144eb168c9aa6306c0c240033f70f0 Mon Sep 17 00:00:00 2001 From: "arnamoy.bhattacharyya" Date: Fri, 11 Nov 2022 10:13:52 -0500 Subject: [PATCH 13/13] Adding XFAIL to test cases when we run. 
The XFAIL markers will be removed once the full device pipeline for the
coordinate intrinsic is supported.
---
 sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp | 2 ++
 sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp b/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp
index 65d12b6cf6fa4..a03a2ef274f0a 100644
--- a/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp
+++ b/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp
@@ -1,4 +1,6 @@
 // RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=2 %s -o %t.out
+// RUN: %t.out
+// XFAIL: *
 
 // this code calculates the sum of rows into a global array of number of rows
 // elements. First, partial reduction is computed inside each SG, then atomic
diff --git a/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp b/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp
index ced2a5bd5f146..d5da6343fc138 100644
--- a/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp
+++ b/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp
@@ -1,4 +1,7 @@
 // RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=2 %s -o %t.out
+// RUN: %t.out
+// XFAIL: *
+
 #include 
 #include 
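With these hunks, lit first compiles each test and then executes the produced
binary; the XFAIL: * directive marks that execution as an expected failure on
every target, so lit reports XFAIL rather than FAIL until device support lands,
and will flag an unexpected pass (XPASS) once it does. Assuming the standard
lit substitutions, the header of each test now reads roughly:

// RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=2 %s -o %t.out
// RUN: %t.out
// XFAIL: *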