diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index 21181b4080399..e12de1eebc4e8 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -123,6 +123,14 @@ template +extern SYCL_EXTERNAL __ocl_vec_t +__spirv_JointMatrixGetElementCoordINTEL(JOINT_MATRIX_INTEL(T, R, C, L, S, U) *, + size_t i); + template #include #include +#include namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { @@ -256,6 +257,21 @@ class wi_element { wi_element(joint_matrix &Mat, std::size_t i) : M(Mat), idx(i) {} + + std::tuple get_coord() { +#ifdef __SYCL_DEVICE_ONLY__ + __ocl_vec_t coord = + __spirv_JointMatrixGetElementCoordINTEL(M.spvm, idx); + const uint32_t row = coord[0]; + const uint32_t col = coord[1]; + return std::make_tuple(row, col); +#else + throw runtime_error("joint matrix is not supported on host device.", + PI_ERROR_INVALID_DEVICE); +#endif // __SYCL_DEVICE_ONLY__ + } + + // Various Operations operator T() { #ifdef __SYCL_DEVICE_ONLY__ return __spirv_VectorExtractDynamic(M.spvm, idx); @@ -339,6 +355,20 @@ class wi_element { wi_element(joint_matrix &Mat, std::size_t i) : M(Mat), idx(i) {} + + std::tuple get_coord() { +#ifdef __SYCL_DEVICE_ONLY__ + __ocl_vec_t coord = + __spirv_JointMatrixGetElementCoordINTEL(M.spvm, idx); + const uint32_t row = coord[0]; + const uint32_t col = coord[1]; + return std::make_tuple(row, col); +#else + throw runtime_error("joint matrix is not supported on host device.", + PI_ERROR_INVALID_DEVICE); +#endif // __SYCL_DEVICE_ONLY__ + } + operator uint16_t() { #ifdef __SYCL_DEVICE_ONLY__ return __spirv_VectorExtractDynamic(M.spvm, idx); @@ -489,6 +519,20 @@ class wi_element &Mat, std::size_t i) : M(Mat), idx(i) {} + + std::tuple get_coord() { +#ifdef __SYCL_DEVICE_ONLY__ + __ocl_vec_t coord = + __spirv_JointMatrixGetElementCoordINTEL(M.spvm, idx); + const uint32_t row = coord[0]; + const uint32_t col = coord[1]; + return std::make_tuple(row, col); +#else + throw 
runtime_error("joint matrix is not supported on host device.", + PI_ERROR_INVALID_DEVICE); +#endif // __SYCL_DEVICE_ONLY__ + } + operator sycl::ext::oneapi::experimental::bfloat16() { #ifdef __SYCL_DEVICE_ONLY__ return __spirv_VectorExtractDynamic(M.spvm, idx); diff --git a/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp b/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp new file mode 100644 index 0000000000000..a03a2ef274f0a --- /dev/null +++ b/sycl/test/matrix/matrix-bfloat16-test-coord-basic.cpp @@ -0,0 +1,117 @@ +// RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=2 %s -o %t.out +// RUN: %t.out +// XFAIL: * + +// This code calculates the sum of rows into a global array of number of rows +// elements. First, partial reduction is computed inside each SG, then atomic +// add is used to reduce between SG leaders. The get_coord() API is used for +// retrieving the row. + +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; + +#define SG_SZ 16 + +#define TN SG_SZ +#define TK 32 + +template struct big_matrix { +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) {} +}; + +template +void sum_rows_ref( + accessor B, + accessor + sum_rows) { + int sum_rows_ref[M] = {0}; + for (size_t i = 0; i < M; i++) { + for (size_t j = 0; j < N; j++) { + sum_rows_ref[i] += B[i][j]; + } + auto diff = sum_rows[i] - sum_rows_ref[i]; + assert(std::fabs(static_cast(diff)) <= + std::numeric_limits::epsilon()); + } +} + +template +void matrix_sum_rows(queue q, big_matrix &B, nd_range<2> &r) { + buffer bufB(B.get_data(), range<2>(M, N)); + // size of vector is known because SG size is set by the user in this case + int sum_rows[M] = {0}; + buffer sum_rows_v(sum_rows, M); // there are a total of TK/4 * 2 = 16 rows + q.submit([&](handler &cgh) { + auto accB = bufB.get_access(cgh); + + auto v = sum_rows_v.get_access(cgh); + + cgh.parallel_for( + r, 
[=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + ext::oneapi::sub_group sg = spmd_item.get_sub_group(); + + joint_matrix sub_b(sg); + + joint_matrix_load(sg, sub_b, + accB.get_pointer() + (global_idx * (TK / 4) * N) + + sg_starty / SG_SZ * TN * 4, + N, layout::packed_b); + + int32_t sum_local_rows[M] = {0}; + auto tBData = sub_b.get_wi_data(); + + // each WI calculates local sum of rows + for (int i = 0; i < tBData.length(); ++i) { + // row and col hold the global coordinates of the matrix + auto [row, col] = tBData[i].get_coord(); + sum_local_rows[row] += tBData[i]; + + sum_local_rows[row] = + reduce_over_group(sg, sum_local_rows[row], sycl::plus<>()); + // only the group leader performs the global reduction + if (global_idy % SG_SZ == 0) { + atomic_fetch_add(v[row], sum_local_rows[row]); + } + } + }); // parallel for + }).wait(); + sum_rows_ref(bufB.get_access(), + sum_rows_v.get_access()); +} + +static constexpr size_t MATRIX_K = TK / 4 * 2; +static constexpr size_t MATRIX_N = TN * 4 * 2; +int8_t B[MATRIX_K][MATRIX_N]; + +int main() { + big_matrix MB((int8_t *)&B); + + size_t NDRangeK = MATRIX_K / (TK / 4); + size_t NDRangeN = (MATRIX_N / 4) / TN; + queue q; + nd_range<2> r({NDRangeK, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}); + + for (int i = 0; i < MATRIX_K; i++) { + for (int j = 0; j < MATRIX_N; j++) { + B[i][j] = i; + } + } + + matrix_sum_rows(q, MB, r); + + return 0; +} \ No newline at end of file diff --git a/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp b/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp new file mode 100644 index 0000000000000..d5da6343fc138 --- /dev/null +++ b/sycl/test/matrix/matrix-bfloat16-test-coord-gemm.cpp @@ -0,0 +1,212 @@ +// RUN: %clangxx -fsycl -O2 
-DSYCL_EXT_ONEAPI_MATRIX_VERSION=2 %s -o %t.out +// RUN: %t.out +// XFAIL: * + +#include +#include + +using namespace sycl::ext::oneapi::experimental::matrix; +using bfloat16 = sycl::ext::oneapi::experimental::bfloat16; + +static constexpr auto TILE_SZ = 16; +static constexpr auto TM = TILE_SZ - 1; +static constexpr auto TN = TILE_SZ - 1; +static constexpr auto TK = 2 * TILE_SZ - 2; + +static constexpr auto SG_SZ = 16; + +template struct big_matrix { +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) {} +}; + +static constexpr size_t MATRIX_M = TM * 2; +static constexpr size_t MATRIX_N = TN * 2; +static constexpr size_t MATRIX_K = TK * 2; +bfloat16 A[MATRIX_M][MATRIX_K]; +bfloat16 B[MATRIX_K / 2][MATRIX_N * 2]; +unsigned short Aref[MATRIX_M][MATRIX_K]; +unsigned short Bref[MATRIX_K / 2][MATRIX_N * 2]; +float C[MATRIX_M][MATRIX_N]; +float D[MATRIX_M][MATRIX_N]; +int32_t *res_local_row; +int32_t *res_local_row_orig; + +template +void matrix_multiply(big_matrix &C, + big_matrix &A, + big_matrix &B) { + size_t M = NUM_ROWS_C; + size_t N = NUM_COLS_C; + size_t K = NUM_COLS_A; + // B => K/4 x N*4, A => M x K, C => M, N + // stride should be X's cols, e.g., B's stride = N*4 + assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * 2); + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + sycl::buffer bufA(A.get_data(), sycl::range<2>(M, K)); + sycl::buffer bufB(B.get_data(), sycl::range<2>(K, N)); + sycl::buffer bufC((float *)C.get_data(), sycl::range<2>(M, N)); + + sycl::buffer res_local_row_buf(res_local_row, + sycl::range<1>(MATRIX_M)); + + sycl::queue q; + q.submit([&](sycl::handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + auto v = res_local_row_buf.get_access(cgh); + + cgh.parallel_for( + sycl::nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), + [accA, accB, accC, M, N, K, 
v](sycl::nd_item<2> spmd_item) + + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup; these functions will be called once by the subgroup, with no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sycl::ext::oneapi::sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a(sg); + joint_matrix sub_b(sg); + joint_matrix sub_c(sg); + + joint_matrix_load(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, layout::row_major); + for (int k = 0; k < K / TK; k += 1) { // + joint_matrix_load( + sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, + K, layout::row_major); + // Assuming B data is already in VNNI format. + joint_matrix_load(sg, sub_b, + accB.get_pointer() + (k * TK / 2) * (N * 2) + + sg_starty / SG_SZ * TN * 2, + N * 2, layout::packed_b); + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + } + // Element wise operation + auto tCData = sub_c.get_wi_data(); + int32_t sum_local_rows[MATRIX_M] = {0}; + + for (int i = 0; i < tCData.length(); ++i) { + auto [row, col] = tCData[i].get_coord(); + sum_local_rows[row] += tCData[i]; + + sum_local_rows[row] = sycl::reduce_over_group( + sg, sum_local_rows[row], sycl::plus<>()); + // only the group leader performs the global reduction + if (global_idy % SG_SZ == 0) { + atomic_fetch_add(v[row], sum_local_rows[row]); + } + } + }); // parallel for + }).wait(); +} + +float make_fp32(short x) { + unsigned int y = x; + y = y << 16; + float *res = reinterpret_cast(&y); + return *res; +} + +unsigned short make_bf16(float x) { + int *res = reinterpret_cast(&x); + *res = *res >> 16; + return (unsigned short)*res; +} + +void matrix_multiply_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, + int K) { + // tiling + for (int m 
= 0; m < M; m++) + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k++) { + short *va = (short *)(A_mem + m * K + k); + short *vb = (short *)(B_mem + k * N + n); + float acc = *((float *)(C_mem + m * N + n)); + // FIXME: Should we do reduce-add in another version? + for (int i = 0; i < 2; i++) { + acc += (make_fp32(va[i]) * make_fp32(vb[i])); + } + *((float *)(C_mem + m * N + n)) = acc; + res_local_row_orig[m] += acc; + } + } +} + +int main() { + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_K; j++) { + // We create bfloat16 from unsigned short since float-to-bfloat's + // conversion is not allowed. + A[i][j] = bfloat16::from_bits(make_bf16(1.0f * (i + j))); + Aref[i][j] = make_bf16(1.0f * (i + j)); + } + } + for (int i = 0; i < MATRIX_K / 2; i++) { + for (int j = 0; j < MATRIX_N * 2; j++) { + B[i][j] = bfloat16::from_bits((make_bf16(2.0f * i + 3.0f * j))); + Bref[i][j] = make_bf16(2.0f * i + 3.0f * j); + } + } + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + C[i][j] = 1.0; + D[i][j] = 1.0; + } + } + + big_matrix MC((float *)&C); + big_matrix MD((float *)&D); + big_matrix MA((bfloat16 *)&A); + big_matrix MB((bfloat16 *)&B); + + res_local_row = (int32_t *)calloc(MATRIX_M, sizeof(int32_t)); + res_local_row_orig = (int32_t *)calloc(MATRIX_M, sizeof(int32_t)); + + matrix_multiply(MC, MA, MB); + matrix_multiply_ref((int32_t *)Aref, (int32_t *)Bref, (int32_t *)D, MATRIX_M, + MATRIX_N, MATRIX_K / 2); + + bool res = true; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + if (C[i][j] != D[i][j]) + res = false; + } + } + for (int i = 0; i < MATRIX_M; i++) { + if (res_local_row[i] != res_local_row_orig[i]) + res = false; + } + if (res) + std::cout << "passed\n"; + else + std::cout << "failed\n"; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) + std::cout << C[i][j] << ", "; + std::cout << "\n"; + } + std::cout << std::endl; + for (int i = 0; i < MATRIX_M; i++) { 
+ for (int j = 0; j < MATRIX_N; j++) + std::cout << D[i][j] << ", "; + std::cout << "\n"; + } +}