diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit.hpp index af4d1927fd176..946abbf767587 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit.hpp @@ -11,7 +11,6 @@ #include #include #include -#include __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { @@ -454,156 +453,6 @@ class wi_element { #undef OP }; -template -class wi_element { - joint_matrix &M; - std::size_t idx; - -public: - wi_element(joint_matrix &Mat, - std::size_t i) - : M(Mat), idx(i) {} - operator sycl::ext::oneapi::experimental::bfloat16() { -#ifdef __SYCL_DEVICE_ONLY__ - return __spirv_VectorExtractDynamic(M.spvm, idx); -#else - throw runtime_error("joint matrix is not supported on host device.", - PI_INVALID_DEVICE); -#endif // __SYCL_DEVICE_ONLY__ - } - - explicit operator bool() { -#ifdef __SYCL_DEVICE_ONLY__ - return std::fabs(static_cast(__spirv_VectorExtractDynamic( - M.spvm, idx))) >= std::numeric_limits::epsilon(); -#else - throw runtime_error("joint matrix is not supported on host device.", - PI_INVALID_DEVICE); -#endif // __SYCL_DEVICE_ONLY__ - } - - wi_element &operator=(const sycl::ext::oneapi::experimental::bfloat16 &rhs) { -#ifdef __SYCL_DEVICE_ONLY__ - M.spvm = __spirv_VectorInsertDynamic(M.spvm, rhs, idx); - return *this; -#else - (void)rhs; - throw runtime_error("joint matrix is not supported on host device.", - PI_INVALID_DEVICE); -#endif // __SYCL_DEVICE_ONLY__ - } - - wi_element & - operator=(const wi_element &rhs) { -#ifdef __SYCL_DEVICE_ONLY__ - M.spvm = __spirv_VectorInsertDynamic( - M.spvm, __spirv_VectorExtractDynamic(rhs.M.spvm, rhs.idx), idx); - return *this; -#else - (void)rhs; - throw runtime_error("joint matrix is not supported on host device.", - PI_INVALID_DEVICE); -#endif // __SYCL_DEVICE_ONLY__ - } - -#if __SYCL_DEVICE_ONLY__ -#define OP(opassign, op) \ - wi_element &operator opassign( \ - const sycl::ext::oneapi::experimental::bfloat16 &rhs) { \ - M.spvm = __spirv_VectorInsertDynamic( \ - M.spvm, __spirv_VectorExtractDynamic(M.spvm, idx) op rhs, idx); \ - return *this; \ - } -#else // __SYCL_DEVICE_ONLY__ -#define OP(opassign, op) \ - wi_element &operator opassign( \ - const sycl::ext::oneapi::experimental::bfloat16 &rhs) { \ - (void)rhs; \ - throw runtime_error("joint matrix is not supported on host device.", \ - PI_INVALID_DEVICE); \ - } -#endif // __SYCL_DEVICE_ONLY__ - OP(+=, +) - OP(-=, -) - OP(*=, *) - OP(/=, /) -#undef OP - -#if __SYCL_DEVICE_ONLY__ -#define OP(type, op) \ - friend type operator op( \ - const wi_element &lhs, \ - const sycl::ext::oneapi::experimental::bfloat16 &rhs) { \ - return __spirv_VectorExtractDynamic(lhs.M.spvm, lhs.idx) op rhs; \ - } \ - friend type operator op( \ - const sycl::ext::oneapi::experimental::bfloat16 &lhs, \ - const wi_element &rhs) { \ - return __spirv_VectorExtractDynamic(rhs.M.spvm, rhs.idx) op lhs; \ - } - OP(sycl::ext::oneapi::experimental::bfloat16, +) - OP(sycl::ext::oneapi::experimental::bfloat16, -) - OP(sycl::ext::oneapi::experimental::bfloat16, *) - OP(sycl::ext::oneapi::experimental::bfloat16, /) -#undef OP -#define OP(type, op) \ - friend type operator op( \ - const wi_element &lhs, \ - const sycl::ext::oneapi::experimental::bfloat16 &rhs) { \ - return type{static_cast(__spirv_VectorExtractDynamic( \ - lhs.M.spvm, lhs.idx)) op static_cast(rhs)}; \ - } \ - friend type operator op( \ - const sycl::ext::oneapi::experimental::bfloat16 &lhs, \ - const wi_element &rhs) { \ - return type{static_cast(__spirv_VectorExtractDynamic( \ - rhs.M.spvm, rhs.idx)) op static_cast(lhs)}; \ - } - OP(bool, ==) - OP(bool, !=) - OP(bool, <) - OP(bool, >) - OP(bool, <=) - OP(bool, >=) -#undef OP -#else // __SYCL_DEVICE_ONLY__ -#define OP(type, op) \ - friend type operator op( \ - const wi_element &, \ - const sycl::ext::oneapi::experimental::bfloat16 &) { \ - throw runtime_error("joint matrix is not supported on host device.", \ - PI_INVALID_DEVICE); \ - } \ - friend type operator op( \ - const sycl::ext::oneapi::experimental::bfloat16 &, \ - const wi_element &) { \ - throw runtime_error("joint matrix is not supported on host device.", \ - PI_INVALID_DEVICE); \ - } - OP(sycl::ext::oneapi::experimental::bfloat16, +) - OP(sycl::ext::oneapi::experimental::bfloat16, -) - OP(sycl::ext::oneapi::experimental::bfloat16, *) - OP(sycl::ext::oneapi::experimental::bfloat16, /) - OP(bool, ==) - OP(bool, !=) - OP(bool, <) - OP(bool, >) - OP(bool, <=) - OP(bool, >=) -#undef OP -#endif // __SYCL_DEVICE_ONLY__ -}; - template class wi_data { diff --git a/sycl/test/matrix/matrix-bfloat16-test.cpp b/sycl/test/matrix/matrix-bfloat16-test.cpp deleted file mode 100644 index fb9995cd0e322..0000000000000 --- a/sycl/test/matrix/matrix-bfloat16-test.cpp +++ /dev/null @@ -1,191 +0,0 @@ -// RUN: %clangxx -fsycl -O2 %s -o %t.out -#include -#if (SYCL_EXT_ONEAPI_MATRIX == 2) -#include - -using namespace sycl::ext::oneapi::experimental::matrix; -using bfloat16 = sycl::ext::oneapi::experimental::bfloat16; - -static constexpr auto TILE_SZ = 16; -static constexpr auto TM = TILE_SZ - 1; -static constexpr auto TN = TILE_SZ - 1; -static constexpr auto TK = 2 * TILE_SZ - 2; - -static constexpr auto SG_SZ = 16; - -template struct big_matrix { -public: - T *mat; - -public: - T *get_data() { return mat; } - void set_data(T *data) { mat = data; } - big_matrix(T *data) : mat(data) {} -}; - -template -void matrix_multiply(big_matrix &C, - big_matrix &A, - big_matrix &B) { - size_t M = NUM_ROWS_C; - size_t N = NUM_COLS_C; - size_t K = NUM_COLS_A; - // B => K/4 x N*4, A => M x K, C => M, N - // stride should be X's cols, e.g., B's stirde = N*4 - assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * 2); - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - sycl::buffer bufA(A.get_data(), sycl::range<2>(M, K)); - sycl::buffer bufB(B.get_data(), sycl::range<2>(K, N)); - sycl::buffer bufC((float *)C.get_data(), sycl::range<2>(M, N)); - - sycl::queue q; - q.submit([&](sycl::handler &cgh) { - auto accC = bufC.get_access(cgh); - auto accA = bufA.get_access(cgh); - auto accB = bufB.get_access(cgh); - - cgh.parallel_for( - sycl::nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [accA, accB, accC, M, N, K](sycl::nd_item<2> spmd_item) - - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sycl::ext::oneapi::sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a(sg); - // For B, since current implementation does not support non-packed - // layout, users need to specify the updated VNNI sizes along with - // the packed_b layout. By default, the layout is row_major and size - // is (TK, TN). - joint_matrix sub_b(sg); - joint_matrix sub_c(sg); - - // AMX: 8 register tiles : 1k byte size, SMmaxxSKmax =16x64 - // strideX = X's cols, so strideC = N, strideA = K, strideB = N*4 - joint_matrix_load(sg, sub_c, - accC.get_pointer() + (sg_startx * TM) * N + - sg_starty / SG_SZ * TN, - N, matrix_layout::row_major); - for (int k = 0; k < K / TK; k += 1) { // - joint_matrix_load( - sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, - K, matrix_layout::row_major); - // Assuming B data is already in VNNI format. - joint_matrix_load(sg, sub_b, - accB.get_pointer() + (k * TK / 2) * (N * 2) + - sg_starty / SG_SZ * TN * 2, - N * 2, matrix_layout::packed_b); - sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); - } - joint_matrix_store(sg, sub_c, - accC.get_pointer() + (sg_startx * TM) * N + - sg_starty / SG_SZ * TN, - N, matrix_layout::row_major); - }); // parallel for - }).wait(); -} - -static constexpr size_t MATRIX_M = TM * 2; -static constexpr size_t MATRIX_N = TN * 2; -static constexpr size_t MATRIX_K = TK * 2; -bfloat16 A[MATRIX_M][MATRIX_K]; -bfloat16 B[MATRIX_K / 2][MATRIX_N * 2]; -unsigned short Aref[MATRIX_M][MATRIX_K]; -unsigned short Bref[MATRIX_K / 2][MATRIX_N * 2]; -float C[MATRIX_M][MATRIX_N]; -float D[MATRIX_M][MATRIX_N]; - -float make_fp32(short x) { - unsigned int y = x; - y = y << 16; - float *res = reinterpret_cast(&y); - return *res; -} - -unsigned short make_bf16(float x) { - int *res = reinterpret_cast(&x); - *res = *res >> 16; - return (unsigned short)*res; -} - -void matrix_multiply_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, - int K) { - // tiling - for (int m = 0; m < M; m++) - for (int n = 0; n < N; n++) { - for (int k = 0; k < K; k++) { - short *va = (short *)(A_mem + m * K + k); - short *vb = (short *)(B_mem + k * N + n); - float acc = *((float *)(C_mem + m * N + n)); - // FIXME: Should we do reduce-add in another version? - for (int i = 0; i < 2; i++) { - acc += (make_fp32(va[i]) * make_fp32(vb[i])); - } - *((float *)(C_mem + m * N + n)) = acc; - } - } -} - -int main() { - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_K; j++) { - // Ee create bfloat16 from unsigned short since float-to-bfloat's - // conversion is not allowed. - A[i][j] = bfloat16::from_bits(make_bf16(1.0f * (i + j))); - Aref[i][j] = make_bf16(1.0f * (i + j)); - } - } - for (int i = 0; i < MATRIX_K / 2; i++) { - for (int j = 0; j < MATRIX_N * 2; j++) { - B[i][j] = bfloat16::from_bits((make_bf16(2.0f * i + 3.0f * j))); - Bref[i][j] = make_bf16(2.0f * i + 3.0f * j); - } - } - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) { - C[i][j] = 1.0; - D[i][j] = 1.0; - } - } - - big_matrix MC((float *)&C); - big_matrix MD((float *)&D); - big_matrix MA((bfloat16 *)&A); - big_matrix MB((bfloat16 *)&B); - matrix_multiply(MC, MA, MB); - matrix_multiply_ref((int32_t *)Aref, (int32_t *)Bref, (int32_t *)D, MATRIX_M, - MATRIX_N, MATRIX_K / 2); - - bool res = true; - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) { - if (C[i][j] != D[i][j]) - res = false; - } - } - if (res) - std::cout << "passed\n"; - else - std::cout << "failed\n"; - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) - std::cout << C[i][j] << ", "; - std::cout << "\n"; - } - std::cout << std::endl; - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) - std::cout << D[i][j] << ", "; - std::cout << "\n"; - } -} -#endif // (SYCL_EXT_ONEAPI_MATRIX == 2)