From 0786c316873f5bbd3951484e9feb154a0cd0571c Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Thu, 8 Aug 2024 05:48:49 -0700 Subject: [PATCH 01/19] [SYCL][Matrix] Use KHR cooperative matrix instructions instead of Intel's The usage is currently guarded by __SPIRV_USE_COOPERATIVE_MATRIX macro. Signed-off-by: Sidorov, Dmitry --- sycl/include/CL/__spirv/spirv_ops.hpp | 131 +++++++++++ sycl/include/CL/__spirv/spirv_types.hpp | 24 ++ .../sycl/ext/oneapi/matrix/matrix-intel.hpp | 213 ++++++++++++++++++ .../sycl/ext/oneapi/matrix/matrix-unified.hpp | 89 +++++++- .../SPVCooperativeMatrix/element_wise_abc.cpp | 14 ++ .../SPVCooperativeMatrix/element_wise_ops.cpp | 14 ++ .../joint_matrix_bfloat16.cpp | 14 ++ .../joint_matrix_bfloat16_array.cpp | 14 ++ .../joint_matrix_half.cpp | 17 ++ .../joint_matrix_prefetch.cpp | 15 ++ .../joint_matrix_ss_int8.cpp | 14 ++ .../joint_matrix_su_int8.cpp | 14 ++ .../joint_matrix_tf32.cpp | 18 ++ .../joint_matrix_us_int8.cpp | 14 ++ .../joint_matrix_uu_int8.cpp | 14 ++ 15 files changed, 617 insertions(+), 2 deletions(-) create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp create mode 100644 
sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index c8a9516b0309b..b33b8dc20a0af 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -27,6 +27,7 @@ extern __DPCPP_SYCL_EXTERNAL float __spirv_RoundFToTF32INTEL(float a); +#ifndef __SPIRV_USE_COOPERATIVE_MATRIX template * __spirv_VectorInsertDynamic(__spv::__spirv_JointMatrixINTEL *, Ts val, size_t i); +#else // __SPIRV_USE_COOPERATIVE_MATRIX +template +extern __DPCPP_SYCL_EXTERNAL + __spv::__spirv_CooperativeMatrixKHR * + __spirv_CooperativeMatrixLoadKHR(T *Ptr, __spv::MatrixLayout Layout = L, + std::size_t Stride = 0, + int MemOperand = 0); + +template +extern __DPCPP_SYCL_EXTERNAL void __spirv_CooperativeMatrixStoreKHR( + T *Ptr, __spv::__spirv_CooperativeMatrixKHR *Object, + __spv::MatrixLayout Layout = L, std::size_t Stride = 0, int MemOperand = 0); + +template +extern __DPCPP_SYCL_EXTERNAL size_t __spirv_CooperativeMatrixLengthKHR( + __spv::__spirv_CooperativeMatrixKHR *); + +template +extern __DPCPP_SYCL_EXTERNAL + __spv::__spirv_CooperativeMatrixKHR * + __spirv_CooperativeMatrixConstructCheckedINTEL(const T Value, size_t Height, + size_t Stride, size_t Width, + size_t CoordX, + size_t CoordY); + +template +extern __DPCPP_SYCL_EXTERNAL + __spv::__spirv_CooperativeMatrixKHR * + __spirv_CooperativeMatrixLoadCheckedINTEL(T *Ptr, std::size_t Stride, + size_t Height, size_t Width, + size_t CoordX, size_t CoordY, + __spv::MatrixLayout Layout = L, + int MemOperand = 0); + +template +extern __DPCPP_SYCL_EXTERNAL void __spirv_CooperativeMatrixStoreCheckedINTEL( + T *Ptr, __spv::__spirv_CooperativeMatrixKHR *Object, + std::size_t Stride, size_t Height, size_t Width, size_t CoordX, + size_t CoordY, __spv::MatrixLayout Layout = L, int MemOperand = 0); + +template +extern __DPCPP_SYCL_EXTERNAL + __spv::__spirv_CooperativeMatrixKHR * + 
__spirv_CooperativeMatrixMulAddKHR( + __spv::__spirv_CooperativeMatrixKHR *A, + __spv::__spirv_CooperativeMatrixKHR *B, + __spv::__spirv_CooperativeMatrixKHR *C, + size_t Operands = 0); + +template +extern __DPCPP_SYCL_EXTERNAL + __spv::__spirv_CooperativeMatrixKHR * + __spirv_CompositeConstruct(const T v); + +// TODO: replace with __spirv_CooperativeMatrixGetElementCoordINTEL when ready +template +extern __DPCPP_SYCL_EXTERNAL __ocl_vec_t +__spirv_JointMatrixGetElementCoordINTEL( + __spv::__spirv_CooperativeMatrixKHR *, size_t i); + +// AccessChain followed by load/store serves to extract/insert an element +// from/to the matrix +template +extern __DPCPP_SYCL_EXTERNAL Ts * +__spirv_AccessChain(__spv::__spirv_CooperativeMatrixKHR **, + size_t i); + +template +extern __DPCPP_SYCL_EXTERNAL + __spv::__spirv_CooperativeMatrixKHR * + __spirv_CooperativeMatrixConstructCheckedINTEL(int32_t CoordX, + int32_t CoordY, + uint32_t Height, + uint32_t Width, + const T Value); + +template +extern __DPCPP_SYCL_EXTERNAL + __spv::__spirv_CooperativeMatrixKHR * + __spirv_CooperativeMatrixLoadCheckedINTEL( + T *Ptr, int32_t CoordX, int32_t CoordY, __spv::MatrixLayout Layout = L, + uint32_t Height = 0, uint32_t Width = 0, std::size_t Stride = 0, + int MemOperand = 0); + +template +extern __DPCPP_SYCL_EXTERNAL void __spirv_CooperativeMatrixStoreCheckedINTEL( + T *Ptr, int32_t CoordX, int32_t CoordY, + __spv::__spirv_CooperativeMatrixKHR *Object, + __spv::MatrixLayout Layout = L, uint32_t Height = 0, uint32_t Width = 0, + std::size_t Stride = 0, int MemOperand = 0); +#endif // __SPIRV_USE_COOPERATIVE_MATRIX template extern __DPCPP_SYCL_EXTERNAL void __spirv_CooperativeMatrixPrefetchINTEL( diff --git a/sycl/include/CL/__spirv/spirv_types.hpp b/sycl/include/CL/__spirv/spirv_types.hpp index 013c2e1acc564..21563c3b8a67a 100644 --- a/sycl/include/CL/__spirv/spirv_types.hpp +++ b/sycl/include/CL/__spirv/spirv_types.hpp @@ -118,10 +118,34 @@ enum class MatrixLayout : uint32_t { enum class
MatrixUse : uint32_t { MatrixA = 0, MatrixB = 1, Accumulator = 2 }; +#ifdef __SPIRV_USE_COOPERATIVE_MATRIX +enum class MatrixOperands : uint32_t { + // SPV_KHR_cooperative_matrix operands + NoneKHR = 0, + MatrixASignedComponentsKHR = 0x1, + MatrixBSignedComponentsKHR = 0x2, + MatrixCSignedComponentsKHR = 0x4, + MatrixResultSignedComponentsKHR = 0x8, + SaturatingAccumulationKHR = 0x10, + // SPV_INTEL_joint_matrix operands + MatrixAAndBTF32ComponentsINTEL = 0x20, + MatrixAAndBBFloat16ComponentsINTEL = 0x40, + MatrixCBFloat16ComponentsINTEL = 0x80, + MatrixResultBFloat16ComponentsINTEL = 0x100 +}; +#endif // __SPIRV_USE_COOPERATIVE_MATRIX + +#ifndef __SPIRV_USE_COOPERATIVE_MATRIX + template struct __spirv_JointMatrixINTEL; +#else +template +struct __spirv_CooperativeMatrixKHR; +#endif // __SPIRV_USE_COOPERATIVE_MATRIX struct __spirv_TaskSequenceINTEL; diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp index 9fb099e7eb8e4..07d5a1939547d 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp @@ -115,8 +115,13 @@ class wi_element { inline __SYCL_ALWAYS_INLINE std::tuple get_coord() { #if defined(__SYCL_DEVICE_ONLY__) +#ifndef __SPIRV_USE_COOPERATIVE_MATRIX __ocl_vec_t coord = __spirv_JointMatrixGetElementCoordINTEL(M.spvm, idx); +#else + __ocl_vec_t coord = + __spirv_CooperativeMatrixGetElementCoordINTEL(M.spvm, idx); +#endif // __SPIRV_USE_COOPERATIVE_MATRIX const size_t row = coord[0]; const size_t col = coord[1]; return std::make_tuple(row, col); @@ -128,12 +133,20 @@ class wi_element { operator storage_element_type() { #ifdef __SYCL_DEVICE_ONLY__ +#ifndef __SPIRV_USE_COOPERATIVE_MATRIX storage_element_type elem = __spirv_VectorExtractDynamic::value, spv_matrix_layout_traits::value, spv_scope_traits::value>(M.spvm, idx); +#else + storage_element_type *ExtractP = + __spirv_AccessChain::value, + spv_scope_traits::value>(&M.spvm, 
idx); + storage_element_type elem = *ExtractP; +#endif // __SPIRV_USE_COOPERATIVE_MATRIX return elem; #else throw exception(make_error_code(errc::runtime), @@ -143,12 +156,20 @@ class wi_element { explicit operator bool() { #ifdef __SYCL_DEVICE_ONLY__ +#ifndef __SPIRV_USE_COOPERATIVE_MATRIX return __spirv_VectorExtractDynamic::value, spv_matrix_layout_traits::value, spv_scope_traits::value>( M.spvm, idx) != static_cast(0); +#else + storage_element_type *ExtractP = + __spirv_AccessChain::value, + spv_scope_traits::value>(&M.spvm, idx); + return *ExtractP != static_cast(0); +#endif // __SPIRV_USE_COOPERATIVE_MATRIX #else throw exception(make_error_code(errc::runtime), "joint matrix is not supported on host."); @@ -157,8 +178,16 @@ class wi_element { template wi_element &operator=(const T2 &rhs) { #ifdef __SYCL_DEVICE_ONLY__ +#ifndef __SPIRV_USE_COOPERATIVE_MATRIX M.spvm = __spirv_VectorInsertDynamic( M.spvm, static_cast(rhs), idx); +#else + storage_element_type *InsertP = + __spirv_AccessChain::value, + spv_scope_traits::value>(&M.spvm, idx); + *InsertP = static_cast(rhs); +#endif // __SPIRV_USE_COOPERATIVE_MATRIX return *this; #else (void)rhs; @@ -170,6 +199,7 @@ class wi_element { wi_element & operator=(const wi_element &rhs) { #ifdef __SYCL_DEVICE_ONLY__ +#ifndef __SPIRV_USE_COOPERATIVE_MATRIX M.spvm = __spirv_VectorInsertDynamic( M.spvm, __spirv_VectorExtractDynamic::value>(rhs.M.spvm, rhs.idx), idx); +#else + storage_element_type *ExtractP = + __spirv_AccessChain::value, + spv_scope_traits::value>(&rhs.M.spvm, + rhs.idx); + storage_element_type *InsertP = + __spirv_AccessChain::value, + spv_scope_traits::value>(&M.spvm, idx); + *InsertP = *ExtractP; +#endif // __SPIRV_USE_COOPERATIVE_MATRIX return *this; #else (void)rhs; @@ -187,6 +229,7 @@ class wi_element { } #if __SYCL_DEVICE_ONLY__ +#ifndef __SPIRV_USE_COOPERATIVE_MATRIX #define OP(op) \ template wi_element &operator op##=(const T2 & rhs) { \ M.spvm = __spirv_VectorInsertDynamic( \ @@ -201,6 +244,22 @@ class 
wi_element { idx); \ return *this; \ } +#else // __SPIRV_USE_COOPERATIVE_MATRIX +#define OP(op) /* read-modify-write of this element through AccessChain */ \ + template wi_element &operator op##=(const T2 & rhs) { \ + storage_element_type *ExtractP = \ + __spirv_AccessChain::value, \ + spv_scope_traits::value>(&M.spvm, \ + idx); \ + storage_element_type *InsertP = \ + __spirv_AccessChain::value, \ + spv_scope_traits::value>(&M.spvm, idx); \ + *InsertP = *ExtractP op static_cast(rhs); \ + return *this; \ + } +#endif // __SPIRV_USE_COOPERATIVE_MATRIX #else // __SYCL_DEVICE_ONLY__ #define OP(op) \ template wi_element &operator op##=(const T2 & rhs) { \ @@ -235,8 +294,13 @@ class wi_element get_coord() { #if defined(__SYCL_DEVICE_ONLY__) +#ifndef __SPIRV_USE_COOPERATIVE_MATRIX __ocl_vec_t coord = __spirv_JointMatrixGetElementCoordINTEL(M.spvm, idx); +#else + __ocl_vec_t coord = + __spirv_CooperativeMatrixGetElementCoordINTEL(M.spvm, idx); +#endif // __SPIRV_USE_COOPERATIVE_MATRIX const uint32_t row = coord[0]; const uint32_t col = coord[1]; return std::make_tuple(row, col); @@ -248,11 +312,20 @@ class wi_element::value, spv_matrix_layout_traits::value, spv_scope_traits::value>(M.spvm, idx); +#else + sycl::ext::oneapi::bfloat16 *ExtractP = + __spirv_AccessChain::value, + spv_scope_traits::value>(&M.spvm, idx); + return *ExtractP; +#endif // __SPIRV_USE_COOPERATIVE_MATRIX #else throw exception(make_error_code(errc::runtime), "joint matrix is not supported on host."); @@ -261,6 +334,7 @@ class wi_element( __spirv_VectorExtractDynamic< sycl::ext::oneapi::bfloat16, sycl::ext::oneapi::bfloat16, @@ -268,6 +342,16 @@ class wi_element::value, spv_scope_traits::value>(M.spvm, idx))) >= std::numeric_limits::epsilon(); +#else + sycl::ext::oneapi::bfloat16 *ExtractP = + __spirv_AccessChain::value, + spv_scope_traits::value>(&M.spvm, idx); + sycl::ext::oneapi::bfloat16 Elem = *ExtractP; + return sycl::fabs(static_cast(Elem)) >= + std::numeric_limits::epsilon(); +#endif // __SPIRV_USE_COOPERATIVE_MATRIX #else throw
exception(make_error_code(errc::runtime), "joint matrix is not supported on host."); @@ -276,7 +360,16 @@ class wi_element::value, + spv_scope_traits::value>(&M.spvm, idx); + *InsertP = rhs; +#endif // __SPIRV_USE_COOPERATIVE_MATRIX return *this; #else (void)rhs; @@ -288,6 +381,7 @@ class wi_element &rhs) { #ifdef __SYCL_DEVICE_ONLY__ +#ifndef __SPIRV_USE_COOPERATIVE_MATRIX M.spvm = __spirv_VectorInsertDynamic( M.spvm, __spirv_VectorExtractDynamic::value, + spv_scope_traits::value>(&rhs.M.spvm, + rhs.idx); + sycl::ext::oneapi::bfloat16 *InsertP = + __spirv_AccessChain::value, + spv_scope_traits::value>(&M.spvm, idx); + *InsertP = *ExtractP; + return *this; +#endif // __SPIRV_USE_COOPERATIVE_MATRIX #else (void)rhs; throw exception(make_error_code(errc::runtime), @@ -306,6 +415,7 @@ class wi_element::value, \ + spv_scope_traits::value>(&M.spvm, idx); \ + sycl::ext::oneapi::bfloat16 *InsertP = \ + __spirv_AccessChain::value, \ + spv_scope_traits::value>(&M.spvm, idx); \ + *InsertP = *ExtractP op rhs; \ + return *this; \ + } +#endif // __SPIRV_USE_COOPERATIVE_MATRIX #else // __SYCL_DEVICE_ONLY__ #define OP(opassign, op) \ wi_element &operator opassign(const sycl::ext::oneapi::bfloat16 & rhs) { \ @@ -333,6 +460,7 @@ class wi_element::value, \ spv_scope_traits::value>(rhs.M.spvm, rhs.idx) op lhs; \ } +#else +#define OP(type, op) \ + friend type operator op( \ + const wi_element &lhs, \ + const sycl::ext::oneapi::bfloat16 &rhs) { \ + sycl::ext::oneapi::bfloat16 *ExtractP = \ + __spirv_AccessChain::value, \ + spv_scope_traits::value>(&lhs.M.spvm, \ + lhs.idx); \ + return *ExtractP op rhs; \ + } \ + friend type operator op( \ + const sycl::ext::oneapi::bfloat16 &lhs, \ + const wi_element &rhs) { \ + sycl::ext::oneapi::bfloat16 *ExtractP = \ + __spirv_AccessChain::value, \ + spv_scope_traits::value>(&rhs.M.spvm, \ + rhs.idx); \ + return *ExtractP op lhs; \ + } +#endif // __SPIRV_USE_COOPERATIVE_MATRIX OP(sycl::ext::oneapi::bfloat16, +) OP(sycl::ext::oneapi::bfloat16, -) 
OP(sycl::ext::oneapi::bfloat16, *) OP(sycl::ext::oneapi::bfloat16, /) #undef OP +#ifndef __SPIRV_USE_COOPERATIVE_MATRIX #define OP(type, op) \ friend type operator op( \ const wi_element::value>(rhs.M.spvm, rhs.idx)) \ op static_cast(lhs)}; \ } +#else +#define OP(type, op) \ + friend type operator op( \ + const wi_element &lhs, \ + const sycl::ext::oneapi::bfloat16 &rhs) { \ + sycl::ext::oneapi::bfloat16 *ExtractP = \ + __spirv_AccessChain::value, \ + spv_scope_traits::value>(&lhs.M.spvm, \ + lhs.idx); \ + return type{static_cast(*ExtractP) op static_cast(rhs)}; \ + } \ + friend type operator op( \ + const sycl::ext::oneapi::bfloat16 &lhs, \ + const wi_element &rhs) { \ + sycl::ext::oneapi::bfloat16 *ExtractP = \ + __spirv_AccessChain::value, \ + spv_scope_traits::value>(&rhs.M.spvm, \ + rhs.idx); \ + return type{static_cast(*ExtractP) op static_cast(lhs)}; \ + } +#endif // __SPIRV_USE_COOPERATIVE_MATRIX OP(bool, ==) OP(bool, !=) OP(bool, <) @@ -448,7 +631,11 @@ class wi_data { public: size_t length() { #if __SYCL_DEVICE_ONLY__ +#ifndef __SPIRV_USE_COOPERATIVE_MATRIX return __spirv_JointMatrixWorkItemLengthINTEL(jm.spvm); +#else + return __spirv_CooperativeMatrixLengthKHR(jm.spvm); +#endif // __SPIRV_USE_COOPERATIVE_MATRIX #else throw exception(make_error_code(errc::runtime), "joint matrix is not supported on host."); @@ -503,6 +690,7 @@ joint_matrix_store(Group, // intel's impl using DecorT = typename sycl::detail::DecoratedType::type; DecorT *Ptr = sycl::detail::getDecorated(dst); +#ifndef __SPIRV_USE_COOPERATIVE_MATRIX __spirv_JointMatrixStoreINTEL::value, @@ -512,6 +700,18 @@ joint_matrix_store(Group, sycl::ext::oneapi::experimental::matrix::spv_matrix_layout_traits< Layout>::value, sycl::ext::oneapi::experimental::matrix::spv_scope_traits::value); +#else + __spirv_CooperativeMatrixStoreKHR< + DecorT, Tp, NumRows, NumCols, + sycl::ext::oneapi::experimental::matrix::spv_matrix_use_traits< + Use>::value, + 
sycl::ext::oneapi::experimental::matrix::spv_matrix_layout_traits< + Layout>::value>( + Ptr, src.spvm, + sycl::ext::oneapi::experimental::matrix::spv_matrix_layout_traits< + Layout>::value, + stride); +#endif // __SPIRV_USE_COOPERATIVE_MATRIX #endif // defined(__NVPTX__) #else std::ignore = src; @@ -548,6 +748,7 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_store( #else // intel's impl T *Ptr = dst.get(); +#ifndef __SPIRV_USE_COOPERATIVE_MATRIX __spirv_JointMatrixStoreINTEL::value, @@ -557,6 +758,18 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_store( sycl::ext::oneapi::experimental::matrix::spv_matrix_layout_traits< Layout>::value, sycl::ext::oneapi::experimental::matrix::spv_scope_traits::value); +#else + __spirv_CooperativeMatrixStoreKHR< + T, Tp, NumRows, NumCols, + sycl::ext::oneapi::experimental::matrix::spv_matrix_use_traits< + Use>::value, + sycl::ext::oneapi::experimental::matrix::spv_matrix_layout_traits< + Layout>::value>( + Ptr, src.spvm, + sycl::ext::oneapi::experimental::matrix::spv_matrix_layout_traits< + Layout>::value, + stride); +#endif // __SPIRV_USE_COOPERATIVE_MATRIX #endif // defined(__NVPTX__) #else std::ignore = src; diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp index cba25494ca65d..d3d57f24c56e6 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp @@ -50,9 +50,15 @@ struct joint_matrix { sycl::ext::oneapi::detail::joint_matrix_hip matrix_impl; #elif defined(__SPIR__) || defined(__SPIRV__) +#ifndef __SPIRV_USE_COOPERATIVE_MATRIX __spv::__spirv_JointMatrixINTEL< T, Rows, Cols, spv_matrix_layout_traits::value, spv_scope_traits::value, spv_matrix_use_traits::value> *spvm; +#else + __spv::__spirv_CooperativeMatrixKHR::value, Rows, + Cols, spv_matrix_use_traits::value> + *spvm; +#endif // __SPIRV_USE_COOPERATIVE_MATRIX #else static_assert(false, "The joint_matrix API is only supported by 
the Intel, " "CUDA and HIP (GFX90A) backends"); @@ -200,12 +206,20 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_load( std::ignore = sg; using DecorT = typename sycl::detail::DecoratedType::type; DecorT *Ptr = sycl::detail::getDecorated(src); +#ifndef __SPIRV_USE_COOPERATIVE_MATRIX res.spvm = __spirv_JointMatrixLoadINTEL< DecorT, S, NumRows, NumCols, spv_matrix_use_traits::value, spv_matrix_layout_traits::value>( Ptr, stride, sycl::detail::joint_matrix_layout_to_spv(Layout), spv_scope_traits::value); +#else + res.spvm = __spirv_CooperativeMatrixLoadKHR< + DecorT, S, NumRows, NumCols, + spv_matrix_use_traits::value, + spv_matrix_layout_traits::value>( + Ptr, sycl::detail::joint_matrix_layout_to_spv(Layout), stride); +#endif // __SPIRV_USE_COOPERATIVE_MATRIX #endif // defined(__NVPTX__) #else std::ignore = sg; @@ -246,12 +260,20 @@ joint_matrix_load(Group sg, std::ignore = sg; using DecorT = typename sycl::detail::DecoratedType::type; DecorT *Ptr = sycl::detail::getDecorated(src); +#ifndef __SPIRV_USE_COOPERATIVE_MATRIX res.spvm = __spirv_JointMatrixLoadINTEL::value, spv_matrix_layout_traits::value>( Ptr, stride, spv_matrix_layout_traits::value, spv_scope_traits::value); +#else + res.spvm = + __spirv_CooperativeMatrixLoadKHR::value, + spv_matrix_layout_traits::value>( + Ptr, spv_matrix_layout_traits::value, stride); +#endif // __SPIRV_USE_COOPERATIVE_MATRIX #endif // defined(__NVPTX__) #else std::ignore = sg; @@ -284,11 +306,18 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_load( #else std::ignore = sg; T *Ptr = src.get(); +#ifndef __SPIRV_USE_COOPERATIVE_MATRIX res.spvm = __spirv_JointMatrixLoadINTEL< T, S, NumRows, NumCols, spv_matrix_use_traits::value, spv_matrix_layout_traits::value>( Ptr, stride, sycl::detail::joint_matrix_layout_to_spv(Layout), spv_scope_traits::value); +#else + res.spvm = __spirv_CooperativeMatrixLoadKHR< + T, S, NumRows, NumCols, spv_matrix_use_traits::value, + spv_matrix_layout_traits::value>( + Ptr, 
sycl::detail::joint_matrix_layout_to_spv(Layout), stride); +#endif // __SPIRV_USE_COOPERATIVE_MATRIX #endif // defined(__NVPTX__) #else std::ignore = sg; @@ -323,12 +352,20 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_load( #else std::ignore = sg; T *Ptr = src.get(); +#ifndef __SPIRV_USE_COOPERATIVE_MATRIX res.spvm = __spirv_JointMatrixLoadINTEL::value, spv_matrix_layout_traits::value>( Ptr, stride, spv_matrix_layout_traits::value, spv_scope_traits::value); +#else + res.spvm = + __spirv_CooperativeMatrixLoadKHR::value, + spv_matrix_layout_traits::value>( + Ptr, spv_matrix_layout_traits::value, stride); +#endif // __SPIRV_USE_COOPERATIVE_MATRIX #endif // defined(__NVPTX__) #else std::ignore = sg; @@ -365,12 +402,20 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_store( std::ignore = sg; using DecorT = typename sycl::detail::DecoratedType::type; DecorT *Ptr = sycl::detail::getDecorated(dst); +#ifndef __SPIRV_USE_COOPERATIVE_MATRIX __spirv_JointMatrixStoreINTEL< DecorT, T, NumRows, NumCols, spv_matrix_use_traits::value, spv_matrix_layout_traits::value>( Ptr, src.spvm, stride, sycl::detail::joint_matrix_layout_to_spv(Layout), spv_scope_traits::value); +#else + __spirv_CooperativeMatrixStoreKHR< + DecorT, T, NumRows, NumCols, + spv_matrix_use_traits::value, + spv_matrix_layout_traits::value>( + Ptr, src.spvm, sycl::detail::joint_matrix_layout_to_spv(Layout), stride); +#endif // __SPIRV_USE_COOPERATIVE_MATRIX #endif // defined(__NVPTX__) #else std::ignore = sg; @@ -403,11 +448,18 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_store( #else std::ignore = sg; T *Ptr = dst.get(); +#ifndef __SPIRV_USE_COOPERATIVE_MATRIX __spirv_JointMatrixStoreINTEL< T, T, NumRows, NumCols, spv_matrix_use_traits::value, spv_matrix_layout_traits::value>( Ptr, src.spvm, stride, sycl::detail::joint_matrix_layout_to_spv(Layout), spv_scope_traits::value); +#else + __spirv_CooperativeMatrixStoreKHR< + T, T, NumRows, NumCols, spv_matrix_use_traits::value, + spv_matrix_layout_traits::value>( + 
Ptr, src.spvm, sycl::detail::joint_matrix_layout_to_spv(Layout), stride); +#endif // __SPIRV_USE_COOPERATIVE_MATRIX #endif // defined(__NVPTX__) #else std::ignore = sg; @@ -464,8 +516,9 @@ joint_matrix_mad( "requires that joint_matrix data types Ta and Tb match"); } #else - if constexpr (std::is_same::value && - std::is_same::value && +#ifndef __SPIRV_USE_COOPERATIVE_MATRIX + if constexpr (std::is_same::value && + std::is_same::value && std::is_same::value) D.spvm = __spirv_JointMatrixMadINTEL(A.spvm, B.spvm, C.spvm); else if constexpr (std::is_unsigned::value && std::is_unsigned::value) @@ -476,6 +529,38 @@ joint_matrix_mad( D.spvm = __spirv_JointMatrixUSMadINTEL(A.spvm, B.spvm, C.spvm); else D.spvm = __spirv_JointMatrixMadINTEL(A.spvm, B.spvm, C.spvm); +#else + if constexpr (std::is_same::value && + std::is_same::value && + std::is_same::value) { + constexpr uint32_t MatrixOperand = static_cast( + __spv::MatrixOperands::MatrixAAndBBFloat16ComponentsINTEL); + D.spvm = __spirv_CooperativeMatrixMulAddKHR(A.spvm, B.spvm, C.spvm, + MatrixOperand); + } else if constexpr (std::is_signed::value && + std::is_unsigned::value) { + constexpr uint32_t MatrixOperand = static_cast( + __spv::MatrixOperands::MatrixASignedComponentsKHR); + D.spvm = __spirv_CooperativeMatrixMulAddKHR(A.spvm, B.spvm, C.spvm, + MatrixOperand); + } else if constexpr (std::is_unsigned::value && + std::is_signed::value) { + constexpr uint32_t MatrixOperand = static_cast( + __spv::MatrixOperands::MatrixBSignedComponentsKHR); + D.spvm = __spirv_CooperativeMatrixMulAddKHR(A.spvm, B.spvm, C.spvm, + MatrixOperand); + } else if constexpr (std::is_signed::value && std::is_signed::value) { + constexpr uint32_t MatrixOperand = + static_cast( + __spv::MatrixOperands::MatrixASignedComponentsKHR) + + static_cast( + __spv::MatrixOperands::MatrixBSignedComponentsKHR); + D.spvm = __spirv_CooperativeMatrixMulAddKHR(A.spvm, B.spvm, C.spvm, + MatrixOperand); + } else { + D.spvm = 
__spirv_CooperativeMatrixMulAddKHR(A.spvm, B.spvm, C.spvm); + } +#endif // __SPIRV_USE_COOPERATIVE_MATRIX #endif // defined(__NVPTX__) #else std::ignore = A; diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc.cpp new file mode 100644 index 0000000000000..3bf9030658112 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc.cpp @@ -0,0 +1,14 @@ +//==----------- element_wise_abc.cpp - DPC++ joint_matrix------------- ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, aspect-ext_intel_matrix + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" +#include "../element_wise_abc_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp new file mode 100644 index 0000000000000..19c2d5ef9c273 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp @@ -0,0 +1,14 @@ +//==----------- element_wise_ops.cpp - DPC++ joint_matrix------------- ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, aspect-ext_intel_matrix + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" +#include "../element_wise_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp new file mode 100644 index 0000000000000..db664554a1a14 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp @@ -0,0 +1,14 @@ +//==-------- joint_matrix_bfloat16.cpp - DPC++ joint_matrix----------- ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, aspect-ext_intel_matrix + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" +#include "../joint_matrix_bfloat16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp new file mode 100644 index 0000000000000..bcbb20228ac00 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp @@ -0,0 +1,14 @@ +//==-------- joint_matrix_bfloat16_array.cpp - DPC++ joint_matrix----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, aspect-ext_intel_matrix + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" +#include "../joint_matrix_bfloat16_array_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half.cpp new file mode 100644 index 0000000000000..424bb83050048 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half.cpp @@ -0,0 +1,17 @@ +//==-------- joint_matrix_half.cpp - DPC++ joint_matrix------------ ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-fp16 +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: matrix-fp16 +// REQUIRES: gpu + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" +#include "../joint_matrix_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp new file mode 100644 index 0000000000000..fe3c25e338094 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp @@ -0,0 +1,15 @@ +//==-------- joint_matrix_prefetch.cpp - DPC++ joint_matrix----------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, aspect-ext_intel_matrix +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +constexpr size_t TN = 16; +#include "../joint_matrix_prefetch_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp new file mode 100644 index 0000000000000..ac650910f237a --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp @@ -0,0 +1,14 @@ +//==-------- joint_matrix_ss_int8.cpp - DPC++ joint_matrix------------ ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, aspect-ext_intel_matrix + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" +#include "../joint_matrix_ss_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp new file mode 100644 index 0000000000000..55d85decd7caa --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp @@ -0,0 +1,14 @@ +//==-------- joint_matrix_su_int8.cpp - DPC++ joint_matrix------------ ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, aspect-ext_intel_matrix + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" +#include "../joint_matrix_su_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp new file mode 100644 index 0000000000000..bdf6afc5bc395 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp @@ -0,0 +1,18 @@ +//==---------------- joint_matrix_tf32.cpp - DPC++ joint_matrix------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, matrix-tf32 +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +constexpr size_t TN = 16; + +#include "../joint_matrix_tf32_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp new file mode 100644 index 0000000000000..08e2f532b887c --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp @@ -0,0 +1,14 @@ +//==-------- joint_matrix_us_int8.cpp - DPC++ joint_matrix------------ ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, aspect-ext_intel_matrix + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" +#include "../joint_matrix_us_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp new file mode 100644 index 0000000000000..f1b19bc257b1d --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp @@ -0,0 +1,14 @@ +//==-------- joint_matrix_uu_int8.cpp - DPC++ joint_matrix------------ ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, aspect-ext_intel_matrix + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" +#include "../joint_matrix_uu_int8_impl.hpp" From 534681d8125fde5fc07ebb8678c7bd7bb664f110 Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Thu, 8 Aug 2024 07:35:52 -0700 Subject: [PATCH 02/19] format Signed-off-by: Sidorov, Dmitry --- sycl/include/CL/__spirv/spirv_ops.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index b33b8dc20a0af..c3a4dac369c88 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -175,7 +175,7 @@ template * __spirv_VectorInsertDynamic(__spv::__spirv_JointMatrixINTEL *, Ts val, size_t i); -#else // __SPIRV_USE_COOPERATIVE_MATRIX +#else // __SPIRV_USE_COOPERATIVE_MATRIX template Date: Thu, 8 Aug 2024 07:52:54 -0700 Subject: [PATCH 
03/19] coord fix Signed-off-by: Sidorov, Dmitry --- sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp index 07d5a1939547d..a3749a0137e78 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp @@ -120,7 +120,7 @@ class wi_element { __spirv_JointMatrixGetElementCoordINTEL(M.spvm, idx); #else __ocl_vec_t coord = - __spirv_CooperativeMatrixGetElementCoordINTEL(M.spvm, idx); + __spirv_JointMatrixGetElementCoordINTEL(M.spvm, idx); #endif // __SPIRV_USE_COOPERATIVE_MATRIX const size_t row = coord[0]; const size_t col = coord[1]; @@ -299,7 +299,7 @@ class wi_element coord = - __spirv_CooperativeMatrixGetElementCoordINTEL(M.spvm, idx); + __spirv_JointMatrixGetElementCoordINTEL(M.spvm, idx); #endif // __SPIRV_USE_COOPERATIVE_MATRIX const uint32_t row = coord[0]; const uint32_t col = coord[1]; From e95ff62821e5c7fa5b196d6a38476c131e8ce4eb Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Mon, 12 Aug 2024 07:53:50 -0700 Subject: [PATCH 04/19] wip Signed-off-by: Sidorov, Dmitry --- sycl/test-e2e/replace.sh | 19 +++++++++++++++++++ sycl/test-e2e/replace2.sh | 15 +++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 sycl/test-e2e/replace.sh create mode 100644 sycl/test-e2e/replace2.sh diff --git a/sycl/test-e2e/replace.sh b/sycl/test-e2e/replace.sh new file mode 100644 index 0000000000000..902efda078010 --- /dev/null +++ b/sycl/test-e2e/replace.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +for file in *.cpp; do + if grep -q "// RUN: %{build} -o %t.out" "$file"; then + sed -i 's|// RUN: %{build} -o %t.out|// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out|' "$file" + echo "Updated $file" + else + echo "No matching line found in $file" + fi + + if grep -q '#include "' "$file"; then + sed -i -E 's|#include 
"([^"]+)"|#include "../\1"|' "$file" + echo "Updated all #include lines in $file" + else + echo "No #include lines found in $file" + fi + +done + diff --git a/sycl/test-e2e/replace2.sh b/sycl/test-e2e/replace2.sh new file mode 100644 index 0000000000000..0f19f0486f68f --- /dev/null +++ b/sycl/test-e2e/replace2.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +find . -name "*.cpp" | while read -r file; do + + line_number=$(grep -n -m 1 "// REQUIRES: .*" "$file" | cut -d: -f1) + + if [ ! -z "$line_number" ]; then + sed -i "${line_number}s|// REQUIRES: |// REQUIRES: gpu, |" "$file" + echo "Updated line $line_number in $file" + else + echo "No matching line found in $file" + fi + +done + From cc8b76200ce3907b2db39057dad9ad4e0497d6b8 Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Mon, 12 Aug 2024 07:56:31 -0700 Subject: [PATCH 05/19] move tests Signed-off-by: Sidorov, Dmitry --- .../SG32/element_wise_abc.cpp | 20 ++ .../SG32/element_wise_all_ops.cpp | 20 ++ .../SG32/element_wise_all_ops_half.cpp | 21 ++ .../SG32/element_wise_all_ops_int8.cpp | 20 ++ .../SG32/element_wise_all_ops_int8_packed.cpp | 22 ++ .../SG32/element_wise_all_ops_tf32.cpp | 19 + .../SG32/element_wise_all_sizes.cpp | 18 + .../SG32/element_wise_ops.cpp | 20 ++ .../SG32/get_coord_float_matC.cpp | 20 ++ .../SG32/get_coord_int8_matA.cpp | 20 ++ .../SG32/get_coord_int8_matB.cpp | 21 ++ .../SG32/joint_matrix_all_sizes.cpp | 20 ++ .../SG32/joint_matrix_annotated_ptr.cpp | 22 ++ .../SG32/joint_matrix_apply_bf16.cpp | 20 ++ .../SG32/joint_matrix_apply_two_matrices.cpp | 20 ++ .../SG32/joint_matrix_bf16_fill_k_cache.cpp | 22 ++ .../joint_matrix_bf16_fill_k_cache_init.cpp | 22 ++ .../joint_matrix_bf16_fill_k_cache_unroll.cpp | 24 ++ ...t_matrix_bf16_fill_k_cache_unroll_init.cpp | 24 ++ .../SG32/joint_matrix_bfloat16.cpp | 20 ++ .../SG32/joint_matrix_bfloat16_array.cpp | 20 ++ ...nt_matrix_bfloat16_colmajorA_colmajorB.cpp | 31 ++ .../SG32/joint_matrix_bfloat16_packedB.cpp | 19 + .../SG32/joint_matrix_colA_rowB_colC.cpp 
| 21 ++ .../SG32/joint_matrix_down_convert.cpp | 18 + .../SG32/joint_matrix_half.cpp | 22 ++ .../joint_matrix_int8_colmajorA_colmajorB.cpp | 27 ++ .../joint_matrix_int8_rowmajorA_rowmajorB.cpp | 22 ++ .../SG32/joint_matrix_out_bounds.cpp | 22 ++ .../SG32/joint_matrix_prefetch.cpp | 22 ++ .../SG32/joint_matrix_rowmajorA_rowmajorB.cpp | 27 ++ .../SG32/joint_matrix_ss_int8.cpp | 20 ++ .../SG32/joint_matrix_su_int8.cpp | 20 ++ .../SG32/joint_matrix_tf32.cpp | 22 ++ .../SG32/joint_matrix_transposeC.cpp | 20 ++ .../SG32/joint_matrix_unaligned_k.cpp | 22 ++ .../SG32/joint_matrix_us_int8.cpp | 20 ++ .../SG32/joint_matrix_uu_int8.cpp | 20 ++ .../SPVCooperativeMatrix/element_wise_abc.cpp | 14 + .../SPVCooperativeMatrix/element_wise_ops.cpp | 14 + .../joint_matrix_bfloat16.cpp | 14 + .../joint_matrix_bfloat16_array.cpp | 14 + .../joint_matrix_half.cpp | 17 + .../joint_matrix_prefetch.cpp | 15 + .../joint_matrix_ss_int8.cpp | 14 + .../joint_matrix_su_int8.cpp | 14 + .../joint_matrix_tf32.cpp | 18 + .../joint_matrix_us_int8.cpp | 14 + .../joint_matrix_uu_int8.cpp | 14 + .../Matrix/SPVCooperativeMatrix/common.hpp | 213 +++++++++++ .../SPVCooperativeMatrix/element_wise_abc.cpp | 8 +- .../element_wise_abc_impl.hpp | 119 +++++++ .../element_wise_all_ops.cpp | 17 + .../element_wise_all_ops_1d.cpp | 17 + .../element_wise_all_ops_1d_cont.cpp | 17 + .../element_wise_all_ops_half.cpp | 16 + .../element_wise_all_ops_half_impl.hpp | 127 +++++++ .../element_wise_all_ops_impl.hpp | 248 +++++++++++++ .../element_wise_all_ops_int8.cpp | 14 + .../element_wise_all_ops_int8_impl.hpp | 127 +++++++ .../element_wise_all_ops_int8_packed.cpp | 16 + .../element_wise_all_ops_int8_packed_impl.hpp | 130 +++++++ .../element_wise_all_ops_scalar.cpp | 17 + .../element_wise_all_ops_tf32.cpp | 18 + .../element_wise_all_ops_tf32_impl.hpp | 102 ++++++ .../element_wise_all_sizes.cpp | 23 ++ .../element_wise_all_sizes_impl.hpp | 130 +++++++ .../SPVCooperativeMatrix/element_wise_ops.cpp | 8 +- 
.../element_wise_ops_impl.hpp | 150 ++++++++ .../elemwise_irreg_size_ops_bf16.cpp | 145 ++++++++ .../get_coord_float_matC.cpp | 14 + .../get_coord_float_matC_impl.hpp | 130 +++++++ .../get_coord_int8_matA.cpp | 14 + .../get_coord_int8_matA_impl.hpp | 176 +++++++++ .../get_coord_int8_matB.cpp | 15 + .../get_coord_int8_matB_impl.hpp | 214 +++++++++++ .../joint_matrix_all_sizes.cpp | 14 + .../joint_matrix_all_sizes_impl.hpp | 155 ++++++++ .../joint_matrix_annotated_ptr.cpp | 19 + .../joint_matrix_annotated_ptr_impl.hpp | 128 +++++++ .../joint_matrix_apply_bf16.cpp | 14 + .../joint_matrix_apply_bf16_impl.hpp | 113 ++++++ .../joint_matrix_apply_two_matrices.cpp | 14 + .../joint_matrix_apply_two_matrices_impl.hpp | 157 ++++++++ .../joint_matrix_bf16_fill_k_cache.cpp | 16 + .../joint_matrix_bf16_fill_k_cache_OOB.cpp | 19 + .../joint_matrix_bf16_fill_k_cache_impl.hpp | 337 ++++++++++++++++++ .../joint_matrix_bf16_fill_k_cache_init.cpp | 16 + .../joint_matrix_bf16_fill_k_cache_unroll.cpp | 18 + ...t_matrix_bf16_fill_k_cache_unroll_init.cpp | 18 + .../joint_matrix_bfloat16.cpp | 8 +- .../joint_matrix_bfloat16_array.cpp | 8 +- .../joint_matrix_bfloat16_array_impl.hpp | 153 ++++++++ ...nt_matrix_bfloat16_colmajorA_colmajorB.cpp | 22 ++ ...trix_bfloat16_colmajorA_colmajorB_impl.hpp | 105 ++++++ .../joint_matrix_bfloat16_impl.hpp | 142 ++++++++ .../joint_matrix_bfloat16_packedB.cpp | 18 + .../joint_matrix_bfloat16_packedB_impl.hpp | 132 +++++++ .../joint_matrix_colA_rowB_colC.cpp | 19 + .../joint_matrix_colA_rowB_colC_impl.hpp | 99 +++++ .../joint_matrix_down_convert.cpp | 14 + .../joint_matrix_down_convert_impl.hpp | 82 +++++ .../joint_matrix_half.cpp | 7 +- .../joint_matrix_half_impl.hpp | 131 +++++++ .../joint_matrix_int8_colmajorA_colmajorB.cpp | 22 ++ ...t_matrix_int8_colmajorA_colmajorB_impl.hpp | 114 ++++++ .../joint_matrix_int8_rowmajorA_rowmajorB.cpp | 19 + ...t_matrix_int8_rowmajorA_rowmajorB_impl.hpp | 123 +++++++ .../joint_matrix_opt_kernel_feature.cpp | 18 + 
.../joint_matrix_opt_kernel_feature_impl.hpp | 109 ++++++ ...trix_opt_kernel_feature_unsupported_hw.cpp | 36 ++ .../joint_matrix_out_bounds.cpp | 22 ++ .../joint_matrix_out_bounds_impl.hpp | 124 +++++++ .../joint_matrix_prefetch.cpp | 8 +- .../joint_matrix_prefetch_impl.hpp | 148 ++++++++ .../joint_matrix_query_default.cpp | 174 +++++++++ .../joint_matrix_rowmajorA_rowmajorB.cpp | 22 ++ .../joint_matrix_rowmajorA_rowmajorB_impl.hpp | 151 ++++++++ .../joint_matrix_ss_int8.cpp | 8 +- .../joint_matrix_ss_int8_impl.hpp | 128 +++++++ .../joint_matrix_su_int8.cpp | 8 +- .../joint_matrix_su_int8_impl.hpp | 132 +++++++ .../joint_matrix_tf32.cpp | 8 +- .../joint_matrix_tf32_impl.hpp | 120 +++++++ .../joint_matrix_transposeC.cpp | 14 + .../joint_matrix_transposeC_impl.hpp | 151 ++++++++ .../joint_matrix_unaligned_k.cpp | 20 ++ .../joint_matrix_us_int8.cpp | 8 +- .../joint_matrix_us_int8_impl.hpp | 133 +++++++ .../joint_matrix_uu_int8.cpp | 8 +- .../joint_matrix_uu_int8_impl.hpp | 133 +++++++ 131 files changed, 7092 insertions(+), 44 deletions(-) create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_abc.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_half.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8_packed.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_tf32.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_ops.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_float_matC.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matA.cpp create mode 
100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matB.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_all_sizes.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_annotated_ptr.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_bf16.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_array.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_colA_rowB_colC.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_down_convert.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_half.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_out_bounds.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp create mode 100644 
sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_ss_int8.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_su_int8.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_tf32.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_us_int8.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_uu_int8.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/element_wise_abc.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/element_wise_ops.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_half.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/common.hpp create mode 100644 
sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d_cont.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_scalar.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/elemwise_irreg_size_ops_bf16.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matA.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matA_impl.hpp 
create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_annotated_ptr.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_annotated_ptr_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_OOB.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_init.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_impl.hpp create mode 100644 
sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_colA_rowB_colC.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_colA_rowB_colC_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_down_convert.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_down_convert_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature_unsupported_hw.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_query_default.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB_impl.hpp create mode 100644 
sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_unaligned_k.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8_impl.hpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8_impl.hpp diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_abc.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_abc.cpp new file mode 100644 index 0000000000000..1fdd989ae091f --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_abc.cpp @@ -0,0 +1,20 @@ +//==----------- element_wise_abc.cpp - DPC++ joint_matrix------------- ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../element_wise_abc_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops.cpp new file mode 100644 index 0000000000000..fdcc7d2aef9f7 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops.cpp @@ -0,0 +1,20 @@ +//==------------ element_wise_all_ops.cpp - DPC++ joint_matrix-------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../element_wise_all_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_half.cpp new file mode 100644 index 0000000000000..847f4a7812aa2 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_half.cpp @@ -0,0 +1,21 @@ +//==----------- element_wise_all_ops_half.cpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-fp16 +// REQUIRES: aspect-ext_intel_matrix, gpu +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../element_wise_all_ops_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8.cpp new file mode 100644 index 0000000000000..984ff9a9b082f --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8.cpp @@ -0,0 +1,20 @@ +//==----------- element_wise_all_ops_int8.cpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../element_wise_all_ops_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8_packed.cpp new file mode 100644 index 0000000000000..af2f4df82b648 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8_packed.cpp @@ -0,0 +1,22 @@ +//==------ element_wise_all_ops_int8_packed.cpp - DPC++ joint_matrix-------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// This test stores the matrix B that is VNNIed (packed). 
+ +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../element_wise_all_ops_int8_packed_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_tf32.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_tf32.cpp new file mode 100644 index 0000000000000..06d459a2a3ce5 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_tf32.cpp @@ -0,0 +1,19 @@ +//==----------- element_wise_all_ops_tf32.cpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix-tf32 +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +#define SG_SZ 32 +constexpr size_t TN = 16; + +#include "../element_wise_all_ops_tf32_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp new file mode 100644 index 0000000000000..4624110577ea2 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp @@ -0,0 +1,18 @@ +//==----------- element_wise_all_sizes.cpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../element_wise_all_sizes_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_ops.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_ops.cpp new file mode 100644 index 0000000000000..1a09518e65ffb --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_ops.cpp @@ -0,0 +1,20 @@ +//==----------- element_wise_ops.cpp - DPC++ joint_matrix------------- ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../element_wise_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_float_matC.cpp new file mode 100644 index 0000000000000..9b4777faced20 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_float_matC.cpp @@ -0,0 +1,20 @@ +//==----------- get_coord_float_matC.cpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../get_coord_float_matC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matA.cpp new file mode 100644 index 0000000000000..09c3ccf9983ab --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matA.cpp @@ -0,0 +1,20 @@ +//==----------- get_coord_int8_matA.cpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../get_coord_int8_matA_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matB.cpp new file mode 100644 index 0000000000000..80e0c0c6b845d --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matB.cpp @@ -0,0 +1,21 @@ +//==----------- get_coord_int8_matB.cpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out +// XFAIL: cpu + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../get_coord_int8_matB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_all_sizes.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_all_sizes.cpp new file mode 100644 index 0000000000000..3fe1f33343b17 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_all_sizes.cpp @@ -0,0 +1,20 @@ +//==-------- joint_matrix_all_sizes.cpp - DPC++ joint_matrix---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../joint_matrix_all_sizes_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_annotated_ptr.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_annotated_ptr.cpp new file mode 100644 index 0000000000000..7a6aca1024418 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_annotated_ptr.cpp @@ -0,0 +1,22 @@ +//==-------- joint_matrix_annotated_ptr.cpp - DPC++ joint_matrix-----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +// Currently row major B fails when annotated_ptr is used +// XFAIL: gpu + +#include "../common.hpp" + +#define SG_SZ 32 +constexpr size_t TN = 16; + +#include "../joint_matrix_annotated_ptr_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_bf16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_bf16.cpp new file mode 100644 index 0000000000000..88d5ba29922de --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_bf16.cpp @@ -0,0 +1,20 @@ +//==----------- joint_matrix_apply_bf16.cpp - DPC++ joint_matrix-----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../joint_matrix_apply_bf16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp new file mode 100644 index 0000000000000..8e5ebeed1c222 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp @@ -0,0 +1,20 @@ +//==------ joint_matrix_apply_two_matrices.cpp - DPC++ joint_matrix--------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix + +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -ffp-model=precise -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../joint_matrix_apply_two_matrices_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache.cpp new file mode 100644 index 0000000000000..10334f93afa80 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache.cpp @@ -0,0 +1,22 @@ +//==--- joint_matrix_bf16_fill_k_cache.cpp - DPC++ joint_matrix----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -ffp-model=precise +// RUN: %{run} %t.out + +// -ffp-model=precise is added to not depend on compiler defaults. 
+ +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp new file mode 100644 index 0000000000000..32af965ec431a --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp @@ -0,0 +1,22 @@ +//==---joint_matrix_bf16_fill_k_cache_init.cpp - DPC++ joint_matrix--------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix, gpu +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -DINIT_LIST -ffp-model=precise +// RUN: %{run} %t.out + +// -ffp-model=precise is added to not depend on compiler defaults. + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp new file mode 100644 index 0000000000000..1c7533e331e73 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp @@ -0,0 +1,24 @@ +//==---joint_matrix_bf16_fill_k_cache_unroll.cpp - DPC++ joint_matrix------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -mllvm -inline-threshold=5000 -ffp-model=precise -o %t.out -DMANUAL_UNROLL +// RUN: %{run} %t.out + +// -mllvm -inline-threshold added as a workaround, +// since IGC doesn't support some variants of IR for Joint Matrix currently +// -ffp-model=precise is added to not depend on compiler defaults. + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp new file mode 100644 index 0000000000000..f8d30cdc26756 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp @@ -0,0 +1,24 @@ +//==--joint_matrix_bf16_fill_k_cache_unroll_init.cpp - DPC++ joint_matrix--==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix, gpu +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -mllvm -inline-threshold=5000 -ffp-model=precise -o %t_gpu.out -DINIT_LIST -DMANUAL_UNROLL +// RUN: %{run} %t_gpu.out + +// -mllvm -inline-threshold added as a workaround, +// since IGC doesn't support some variants of IR for Joint Matrix currently +// -ffp-model=precise is added to not depend on compiler defaults. + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16.cpp new file mode 100644 index 0000000000000..4857ded37a011 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16.cpp @@ -0,0 +1,20 @@ +//==-------- joint_matrix_bfloat16.cpp - DPC++ joint_matrix----------- ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../joint_matrix_bfloat16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_array.cpp new file mode 100644 index 0000000000000..87fd837446618 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_array.cpp @@ -0,0 +1,20 @@ +//==-------- joint_matrix_bfloat16_array.cpp - DPC++ joint_matrix----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../joint_matrix_bfloat16_array_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp new file mode 100644 index 0000000000000..6532bcfe47bff --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -0,0 +1,31 @@ +//==-- joint_matrix_bfloat16_colmajorA_colmajorB.cpp - DPC++ joint_matrix--==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +// This tests support of col major layout for matrix B which does transpose and +// then VNNI transform. 
This is currently only available on AMX + +// XFAIL: gpu + +#include "../common.hpp" +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; +using bfloat16 = sycl::ext::oneapi::bfloat16; + +#define SG_SZ 32 +constexpr size_t TN = 16; + +#include "../joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp new file mode 100644 index 0000000000000..d2ba1cdbdd38e --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp @@ -0,0 +1,19 @@ +//==----- joint_matrix_bfloat16_packedB.cpp - DPC++ joint_matrix----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +// XFAIL: gpu + +#include "../common.hpp" + +#define SG_SZ 32 +#include "../joint_matrix_bfloat16_packedB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_colA_rowB_colC.cpp new file mode 100644 index 0000000000000..70e53441cb48f --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_colA_rowB_colC.cpp @@ -0,0 +1,21 @@ +//==---------- joint_matrix_colA_rowB_colC.cpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +// XFAIL:* + +#include "../common.hpp" + +#define SG_SZ 32 +constexpr size_t TN = 16; + +#include "../joint_matrix_colA_rowB_colC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_down_convert.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_down_convert.cpp new file mode 100644 index 0000000000000..b474f846d11d5 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_down_convert.cpp @@ -0,0 +1,18 @@ +//==-------- joint_matrix_down_convert.cpp - DPC++ joint_matrix------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../joint_matrix_down_convert_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_half.cpp new file mode 100644 index 0000000000000..2b990ea3c6098 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_half.cpp @@ -0,0 +1,22 @@ +//==-------- joint_matrix_half.cpp - DPC++ joint_matrix------------ ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-fp16 +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: matrix-fp16 +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../joint_matrix_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp new file mode 100644 index 0000000000000..c89c657c77fbc --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp @@ -0,0 +1,27 @@ +//==----- joint_matrix_int8_colmajorA_colmajorB.cpp - DPC++ joint_matrix---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +// This tests support of col major layout for matrix B which does transpose and +// then VNNI transform. 
This is currently only available on AMX + +// XFAIL: gpu + +#include "../common.hpp" + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; + +#define SG_SZ 32 +constexpr size_t TN = 16; + +#include "../joint_matrix_int8_colmajorA_colmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp new file mode 100644 index 0000000000000..d73a2300888d3 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp @@ -0,0 +1,22 @@ +//==--- joint_matrix_int8_rowmajorA_rowmajorB.cpp - DPC++ joint_matrix-----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +// XFAIL: gpu + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../joint_matrix_int8_rowmajorA_rowmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_out_bounds.cpp new file mode 100644 index 0000000000000..1848a480a0eb7 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_out_bounds.cpp @@ -0,0 +1,22 @@ +//==-------- joint_matrix_out_bounds.cpp - DPC++ joint_matrix--------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +// XFAIL:* + +#include "../common.hpp" + +#define SG_SZ 32 +constexpr size_t TN = 16; +constexpr size_t MATRIX_K = 1024 + 24; + +#include "../joint_matrix_out_bounds_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp new file mode 100644 index 0000000000000..a6ce09fe13b9f --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp @@ -0,0 +1,22 @@ +//==-------- joint_matrix_prefetch.cpp - DPC++ joint_matrix----------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 +// REQUIRES: aspect-ext_intel_matrix +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +// XFAIL: gpu + +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 + +#include "../common.hpp" + +#define SG_SZ 32 +constexpr size_t TN = 16; +#include "../joint_matrix_prefetch_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp new file mode 100644 index 0000000000000..0fe45a24c499f --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp @@ -0,0 +1,27 @@ +//==--------joint_matrix_rowmajorA_rowmajorB.cpp - DPC++ joint_matrix------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This tests support of row major layout for matrix B which does automatic VNNI +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 +// VNNI transform and sub-group size 32 are not supported yet on DG2 by IGC +// UNSUPPORTED: gpu-intel-dg2 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +// Sub-group size 32 support for this test is not currently available in IGC +// XFAIL: gpu + +#include "../common.hpp" + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; + +#define SG_SZ 32 + +#include "../joint_matrix_rowmajorA_rowmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_ss_int8.cpp new file mode 100644 index 0000000000000..08f44d2a00090 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_ss_int8.cpp @@ -0,0 +1,20 @@ +//==-------- joint_matrix_ss_int8.cpp - DPC++ joint_matrix------------ ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../joint_matrix_ss_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_su_int8.cpp new file mode 100644 index 0000000000000..c260cb7558ed5 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_su_int8.cpp @@ -0,0 +1,20 @@ +//==-------- joint_matrix_su_int8.cpp - DPC++ joint_matrix------------ ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../joint_matrix_su_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_tf32.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_tf32.cpp new file mode 100644 index 0000000000000..18da250bc808d --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_tf32.cpp @@ -0,0 +1,22 @@ +//==---------------- joint_matrix_tf32.cpp - DPC++ joint_matrix------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix-tf32 +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; + +#define SG_SZ 32 +constexpr size_t TN = 16; + +#include "../joint_matrix_tf32_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp new file mode 100644 index 0000000000000..6cea5a248e0b2 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp @@ -0,0 +1,20 @@ +//==----------- joint_matrix_transposeC.cpp - DPC++ joint_matrix-----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../joint_matrix_transposeC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp new file mode 100644 index 0000000000000..f4b2426af93a8 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp @@ -0,0 +1,22 @@ +//==-------- joint_matrix_unaligned_k.cpp - DPC++ joint_matrix-------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +// XFAIL:* + +#include "../common.hpp" + +#define SG_SZ 32 +constexpr size_t TN = 16; +static constexpr size_t MATRIX_K = 1024 + 14; + +#include "../joint_matrix_out_bounds_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_us_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_us_int8.cpp new file mode 100644 index 0000000000000..a0ed38373a43a --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_us_int8.cpp @@ -0,0 +1,20 @@ +//==-------- joint_matrix_us_int8.cpp - DPC++ joint_matrix------------ ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../joint_matrix_us_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_uu_int8.cpp new file mode 100644 index 0000000000000..d5c5281f67c9f --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_uu_int8.cpp @@ -0,0 +1,20 @@ +//==-------- joint_matrix_uu_int8.cpp - DPC++ joint_matrix------------ ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../joint_matrix_uu_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/element_wise_abc.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/element_wise_abc.cpp new file mode 100644 index 0000000000000..3bf9030658112 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/element_wise_abc.cpp @@ -0,0 +1,14 @@ +//==----------- element_wise_abc.cpp - DPC++ joint_matrix------------- ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, aspect-ext_intel_matrix + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" +#include "../element_wise_abc_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/element_wise_ops.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/element_wise_ops.cpp new file mode 100644 index 0000000000000..19c2d5ef9c273 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/element_wise_ops.cpp @@ -0,0 +1,14 @@ +//==----------- element_wise_ops.cpp - DPC++ joint_matrix------------- ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, aspect-ext_intel_matrix + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" +#include "../element_wise_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp new file mode 100644 index 0000000000000..db664554a1a14 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp @@ -0,0 +1,14 @@ +//==-------- joint_matrix_bfloat16.cpp - DPC++ joint_matrix----------- ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, aspect-ext_intel_matrix + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" +#include "../joint_matrix_bfloat16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp new file mode 100644 index 0000000000000..bcbb20228ac00 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp @@ -0,0 +1,14 @@ +//==-------- joint_matrix_bfloat16_array.cpp - DPC++ joint_matrix----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, aspect-ext_intel_matrix + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" +#include "../joint_matrix_bfloat16_array_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_half.cpp new file mode 100644 index 0000000000000..424bb83050048 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_half.cpp @@ -0,0 +1,17 @@ +//==-------- joint_matrix_half.cpp - DPC++ joint_matrix------------ ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-fp16 +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: matrix-fp16 +// REQUIRES: gpu + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" +#include "../joint_matrix_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp new file mode 100644 index 0000000000000..fe3c25e338094 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp @@ -0,0 +1,15 @@ +//==-------- joint_matrix_prefetch.cpp - DPC++ joint_matrix----------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, aspect-ext_intel_matrix +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +constexpr size_t TN = 16; +#include "../joint_matrix_prefetch_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp new file mode 100644 index 0000000000000..ac650910f237a --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp @@ -0,0 +1,14 @@ +//==-------- joint_matrix_ss_int8.cpp - DPC++ joint_matrix------------ ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, aspect-ext_intel_matrix + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" +#include "../joint_matrix_ss_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp new file mode 100644 index 0000000000000..55d85decd7caa --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp @@ -0,0 +1,14 @@ +//==-------- joint_matrix_su_int8.cpp - DPC++ joint_matrix------------ ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, aspect-ext_intel_matrix + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" +#include "../joint_matrix_su_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp new file mode 100644 index 0000000000000..bdf6afc5bc395 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp @@ -0,0 +1,18 @@ +//==---------------- joint_matrix_tf32.cpp - DPC++ joint_matrix------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, matrix-tf32 +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" + +constexpr size_t TN = 16; + +#include "../joint_matrix_tf32_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp new file mode 100644 index 0000000000000..08e2f532b887c --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp @@ -0,0 +1,14 @@ +//==-------- joint_matrix_us_int8.cpp - DPC++ joint_matrix------------ ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, aspect-ext_intel_matrix + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" +#include "../joint_matrix_us_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp new file mode 100644 index 0000000000000..f1b19bc257b1d --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp @@ -0,0 +1,14 @@ +//==-------- joint_matrix_uu_int8.cpp - DPC++ joint_matrix------------ ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, aspect-ext_intel_matrix + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{run} %t.out + +#include "../common.hpp" +#include "../joint_matrix_uu_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/common.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/common.hpp new file mode 100644 index 0000000000000..2b134f8144667 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/common.hpp @@ -0,0 +1,213 @@ +//==------------------ common.hpp - DPC++ joint_matrix---------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include +#include +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; +namespace syclex = sycl::ext::oneapi::experimental; +namespace syclintelex = sycl::ext::intel::experimental; +using bfloat16 = sycl::ext::oneapi::bfloat16; + +// Most of the time, failures related to floating-point calculations (both float +// and bfloat16) are caused by accumulation errors rather than the algorithm +// itself. If it is an algorithm issue, the calculated result gap from the +// reference would be much bigger. To avoid flaky test results while catching +// algorithm errors, we are increasing the accuracy threshold. 
+// Something like this should be good enough to catch algorithm errors: +// fabs(ref[i] - val[i])/max(fabs(ref)) < 10e-2 +constexpr float FLOAT_EPSILON = 10e-2; + +template struct big_matrix { +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) {} +}; + +float make_fp32(bfloat16 x) { + unsigned int y = *((int *)&x); + y = y << 16; + float *res = reinterpret_cast(&y); + return *res; +} + +template +void matrix_multiply_ref(Ta *A, Tb *B, Tc *C, int M, int N, int K, + bool transpose_c = false, bool colmajor_a = false, + bool colmajor_b = false, F &&lambda = {}) { + for (unsigned int m = 0; m < M; m++) { + for (unsigned int n = 0; n < N; n++) { + int c_ind = transpose_c ? (n * M + m) : m * N + n; + Tc acc = *(C + c_ind); + + for (unsigned int k = 0; k < K; k++) { + int a_ind = colmajor_a ? (k * M + m) : m * K + k; + int b_ind = colmajor_b ? (n * K + k) : k * N + n; + Ta *va = (Ta *)(A + a_ind * VF); + Tb *vb = (Tb *)(B + b_ind * VF); + + for (unsigned int i = 0; i < VF; i++) { + if constexpr (std::is_same_v && + std::is_same_v) + acc += make_fp32(va[i]) * make_fp32(vb[i]); + else if constexpr (std::is_same_v && + std::is_same_v || + std::is_integral_v && std::is_integral_v || + (std::is_same_v && + std::is_same_v)) + acc += va[i] * vb[i]; + else if constexpr (std::is_same_v && + std::is_same_v) + acc += (float)va[i] * (float)vb[i]; + else + assert(false && "Unsupported type in matrix_multiply_ref."); + } + } + + if constexpr (!std::is_same_v) { + lambda(acc); + } + *(C + c_ind) = acc; + } + } +} + +template +void matrix_vnni(unsigned int rows, unsigned int cols, T *src, T *dest, + unsigned int vnniFactor = 2) { + for (unsigned int i = 0; i < rows / vnniFactor; i++) { + for (unsigned int j = 0; j < cols; j++) { + for (unsigned int k = 0; k < vnniFactor; k++) { + dest[i * cols * vnniFactor + j * vnniFactor + k] = + src[(i * vnniFactor + k) * cols + j]; + } + } + } +} + +template +void 
matrix_transpose(unsigned int rows, unsigned int cols, T *dst, T *src) { + for (unsigned int i = 0; i < rows; i++) { + for (unsigned int j = 0; j < cols; j++) { + dst[i + j * rows] = src[i * cols + j]; + } + } +} + +template +void matrix_fill(unsigned int rows, unsigned int cols, T *src, T val) { + for (unsigned int i = 0; i < rows; i++) { + for (unsigned int j = 0; j < cols; j++) { + src[i * cols + j] = val; + } + } +} + +template +void matrix_fill(unsigned int rows, unsigned int cols, T *src, F op) { + for (unsigned int i = 0; i < rows; i++) { + for (unsigned int j = 0; j < cols; j++) { + src[i * cols + j] = T(op(i, j)); + } + } +} + +template +void matrix_rand(unsigned int rows, unsigned int cols, T *src, T val) { + std::random_device dev; + std::uniform_real_distribution fdistr(-val, val); + std::uniform_int_distribution idistr((int)-val, (int)val); + + for (unsigned int i = 0; i < rows; i++) { + for (unsigned int j = 0; j < cols; j++) { + if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v) { + src[i * cols + j] = T(fdistr(dev)); + } else if constexpr (std::is_integral_v) { + src[i * cols + j] = T(idistr(dev)); + } else { + assert(false && "Unsupported type in matrix_rand."); + } + } + } +} + +template +void matrix_copy(unsigned int rows, unsigned int cols, T *src, T *dst) { + for (unsigned int i = 0; i < rows; i++) { + for (unsigned int j = 0; j < cols; j++) { + dst[i * cols + j] = src[i * cols + j]; + } + } +} + +template +bool matrix_compare(unsigned int rows, unsigned int cols, T1 *src, T2 *ref) { + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + if constexpr (!exact && (std::is_same_v || + std::is_same_v || + (std::is_same_v && + std::is_same_v))) { + float diff = std::fabs(src[i * cols + j] - (T1)ref[i * cols + j]); + if (diff > FLOAT_EPSILON || std::isnan(src[i * cols + j])) { + std::cout << "Incorrect result in matrix. 
" + << "i: " << i << ", j: " << j + << ", Ref: " << (T1)ref[i * cols + j] + << ", Val: " << src[i * cols + j] << ", Diff: " << diff + << ", Epsilon: " << FLOAT_EPSILON << "\n"; + return false; + } + } else if constexpr (exact || std::is_same_v) { + if (src[i * cols + j] != ref[i * cols + j]) { + std::cout << "Incorrect result in matrix." + << "i: " << i << ", j: " << j + << ", Ref: " << ref[i * cols + j] + << ", Val: " << src[i * cols + j] << "\n"; + return false; + } + } else { + std::cout << "Unsupported type in matrix_compare\n"; + return false; + } + } + } + return true; +} + +bool is_type_supported_by_device(queue q, matrix_type type) { + std::vector combinations = + q.get_device() + .get_info(); + for (int i = 0; i < combinations.size(); i++) { + if (combinations[i].atype == type) { + return true; + } + } + return false; +} + +template size_t get_sg_size(queue q) { + auto KernelID = get_kernel_id(); + auto KB = + get_kernel_bundle(q.get_context(), {KernelID}); + auto kernel = KB.get_kernel(KernelID); + + return kernel + .template get_info( + q.get_device()); +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc.cpp index 3bf9030658112..8a4e277bf6d11 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{build} -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" -#include "../element_wise_abc_impl.hpp" +#include "common.hpp" +#include "element_wise_abc_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc_impl.hpp 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc_impl.hpp new file mode 100644 index 0000000000000..5caf6d3e0a3e7 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc_impl.hpp @@ -0,0 +1,119 @@ +//==----------- element_wise_abc_impl.hpp - DPC++ joint_matrix-------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; + +template class add; + +template +void matrix_elem_wise_ops(big_matrix &C, big_matrix &A, + big_matrix &B) { + size_t NDRangeM = 1; + size_t NDRangeN = 1; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC(C.get_data(), range<2>(M, N)); + + queue q; + size_t sg_size = get_sg_size>(q); + q.submit([&](handler &cgh) { + accessor accC{bufC, cgh}; + accessor accA{bufA, cgh}; + accessor accB{bufB, cgh}; + + cgh.parallel_for>( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; + // For B, we assume B has been already VNNIed. 
+ joint_matrix + sub_b; + joint_matrix sub_c; + + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * M) * K, + K); + joint_matrix_apply(sg, sub_a, [](T2 &x) { x += 1; }); + + joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr() + + sg_starty / sg_size * N * vnniFactor, + N * vnniFactor); + joint_matrix_apply(sg, sub_b, [](T2 &x) { x += 1; }); + + joint_matrix_load( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * M) * N + sg_starty / sg_size * N, + N, layout::row_major); + joint_matrix_apply(sg, sub_c, [](T1 &x) { x += 1; }); + }); // parallel for + }).wait(); +} + +template +void test() { + Tc A[TM][TK]; + Tc B[TK / VF][TN * VF]; + Ta C[TM][TN]; + + big_matrix MC((Ta *)&C); + big_matrix MA((Tc *)&A); + big_matrix MB((Tc *)&B); + + return matrix_elem_wise_ops(MC, MA, MB); +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test(); + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test(); + break; + } + } + + return 0; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops.cpp new file mode 100644 index 0000000000000..d14240b145e81 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops.cpp @@ -0,0 +1,17 @@ +//==------------ element_wise_all_ops.cpp - DPC++ joint_matrix-------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// TODO: Reenable, see https://github.com/intel/llvm/issues/14598 +// UNSUPPORTED: windows, linux + +#include "common.hpp" +#include "element_wise_all_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp new file mode 100644 index 0000000000000..ad6bd117044a3 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp @@ -0,0 +1,17 @@ +//==-------- element_wise_all_ops_1d.cpp - DPC++ joint_matrix ---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix, gpu + +// RUN: %{build} -o %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out + +// TODO: Reenable, see https://github.com/intel/llvm/issues/14598 +// UNSUPPORTED: windows, linux + +#include "common.hpp" +#include "element_wise_all_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d_cont.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d_cont.cpp new file mode 100644 index 0000000000000..46e0771238de6 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d_cont.cpp @@ -0,0 +1,17 @@ +//==-------- element_wise_all_ops_1d_cont.cpp - DPC++ joint_matrix ---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix, gpu + +// RUN: %{build} -o %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=2 %{run} %t.out + +// TODO: Reenable, see https://github.com/intel/llvm/issues/14598 +// UNSUPPORTED: windows, linux + +#include "common.hpp" +#include "element_wise_all_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half.cpp new file mode 100644 index 0000000000000..bb651568f9251 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half.cpp @@ -0,0 +1,16 @@ +//==----------- element_wise_all_ops_half.cpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-fp16 +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: matrix-fp16 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include "common.hpp" +#include "element_wise_all_ops_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half_impl.hpp new file mode 100644 index 0000000000000..4065c7a78a566 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half_impl.hpp @@ -0,0 +1,127 @@ +//===---element_wise_all_ops_half_impl.hpp - DPC++ joint_matrix------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +template class add; +template class sub; +template class mul; +template class divide; +template class logic; + +template +void assert_ops_ref(host_accessor C, const R ref) { + for (size_t i = 0; i < Rows; i++) + for (size_t j = 0; j < Cols; j++) { + auto diff = C[i][j] - ref; + assert(std::fabs(static_cast(diff)) < + std::numeric_limits::epsilon()); + } +} + +template +void matrix_verify_op(big_matrix &A, const R ref, OP op) { + buffer bufA(A.get_data(), range<2>(Rows, Cols)); + + queue q; + size_t sg_size = get_sg_size(q); + nd_range<2> r({Rows / TileRows, Cols / TileCols * sg_size}, {1, 1 * sg_size}); + + q.submit([&](handler &cgh) { + sycl::accessor accA{bufA, cgh, sycl::read_write}; + + cgh.parallel_for( + r, [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + + joint_matrix_fill(sg, sub_a, 5); + + joint_matrix_apply(sg, sub_a, op); + ext::intel::experimental::matrix::joint_matrix_store( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TileRows) * Cols + + sg_starty / sg_size * TileCols, + Cols); + }); // parallel for + }).wait(); + assert_ops_ref(bufA.get_host_access(read_only), ref); +} + +template void test() { + static constexpr size_t Rows = TM * 2; + static constexpr size_t Cols = TK * 2; + Ta A[Rows][Cols]; + + big_matrix MA((Ta *)&A); + + matrix_verify_op, TResult>( + MA, 7, [=](Ta &x) { x = x + static_cast(2); }); + matrix_verify_op, TResult>( + MA, 3, [=](Ta &x) { x = x - static_cast(2); }); + matrix_verify_op, TResult>( + MA, 10, [=](Ta &x) { x = x * 
static_cast(2); }); + matrix_verify_op, TResult>( + MA, 2.5, [=](Ta &x) { x = x / static_cast(2); }); + matrix_verify_op, TResult>( + MA, 7, [=](Ta &x) { + if (x) { + if (x > static_cast(2) || x >= static_cast(2) || + x < static_cast(2) || x <= static_cast(2)) { + Ta val = (x != static_cast(2)) ? x : static_cast(2); + val--; + val++; + if (x == static_cast(2)) { + val -= static_cast(2); + val *= static_cast(3); + val /= static_cast(2); + } else { + val += static_cast(2); + } + x = val; + } + } + }); +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test(); + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test(); + break; + } + } + + return 0; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_impl.hpp new file mode 100644 index 0000000000000..db686d23ae05d --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_impl.hpp @@ -0,0 +1,248 @@ +//==----------- element_wise_all_ops_impl.hpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +template +void assert_ops_ref(host_accessor mat, + const float ref) { + for (size_t i = 0; i < NUM_ROWS; i++) + for (size_t j = 0; j < NUM_COLS; j++) { + float diff; + if constexpr (std::is_same_v) + diff = make_fp32(mat[i][j]) - ref; + else + diff = mat[i][j] - ref; + assert(std::fabs(static_cast(diff)) < + std::numeric_limits::epsilon()); + } +} + +template +void verify_op_ab(const T l, const T r, const float ref, OP op) { + T mat[NUM_ROWS / VF][NUM_COLS * VF]; + big_matrix big_mat((T *)&mat); + + buffer bufMat(big_mat.get_data(), + range<2>(NUM_ROWS / VF, NUM_COLS * VF)); + + queue q; + size_t sg_size = get_sg_size(q); + q.submit([&](handler &cgh) { + sycl::accessor accessMat{bufMat, cgh, sycl::read_write}; + cgh.parallel_for( + nd_range<2>({NUM_ROWS / SUB_ROWS, NUM_COLS / SUB_COLS * sg_size}, + {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_mat; + joint_matrix_fill(sg, sub_mat, l); + joint_matrix_apply(sg, sub_mat, [=](T &x) { x = op(x, r); }); + ext::intel::experimental::matrix::joint_matrix_store( + sg, sub_mat, + accessMat.template get_multi_ptr() + + (sg_startx * SUB_ROWS / VF) * NUM_COLS * VF + + sg_starty / sg_size * SUB_COLS * VF, + NUM_COLS * VF); + }); // parallel for + }).wait(); + assert_ops_ref( + bufMat.get_host_access(read_only), ref); +} + +template +void verify_op_c(const T l, const T r, const float ref, OP op) { + T mat[NUM_ROWS][NUM_COLS]; + big_matrix big_mat((T *)&mat); + + buffer bufMat(big_mat.get_data(), range<2>(NUM_ROWS, 
NUM_COLS)); + queue q; + size_t sg_size = get_sg_size(q); + q.submit([&](handler &cgh) { + sycl::accessor accessMat{bufMat, cgh, sycl::read_write}; + cgh.parallel_for( + nd_range<2>({NUM_ROWS / SUB_ROWS, NUM_COLS / SUB_COLS * sg_size}, + {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_mat; + joint_matrix_fill(sg, sub_mat, l); + joint_matrix_apply(sg, sub_mat, [=](T &x) { x = op(x, r); }); + + joint_matrix_store( + sg, sub_mat, + accessMat.template get_multi_ptr() + + (sg_startx * SUB_ROWS) * NUM_COLS + + sg_starty / sg_size * SUB_COLS, + NUM_COLS, layout::row_major); + }); // parallel for + }).wait(); + assert_ops_ref(bufMat.get_host_access(read_only), ref); +} + +// Avoid same kernel name for different types +template +class ewops_ab {}; +template +void test_ewops_ab() { + if constexpr (Use == use::a) + std::cout << "Test A "; + else + std::cout << "Test B "; + std::cout << SROWS << "x" << SCOLS << "\n"; + + static constexpr size_t NROWS = SROWS * 2; + static constexpr size_t NCOLS = SCOLS * 2; + + verify_op_ab>( + T(5.0), T(2.0), 7.0, [](auto l, auto r) { return l + r; }); + verify_op_ab>( + T(5.0), T(2.0), 3.0, [](auto l, auto r) { return l - r; }); + verify_op_ab>( + T(5.0), T(2.0), 10.0, [](auto l, auto r) { return l * r; }); + verify_op_ab>( + T(5.0), T(2.0), 2.5, [](auto l, auto r) { return l / r; }); + verify_op_ab>( + T(5.0), T(5.0), 5.0, [](auto l, auto r) { return l == r ? l : T(1.0); }); + verify_op_ab>( + T(5.0), T(4.0), 4.0, [](auto l, auto r) { return l == r ? l : r; }); + verify_op_ab>( + T(5.0), T(5.0), 1.0, [](auto l, auto r) { return l != r ? 
l : T(1.0); }); + verify_op_ab>( + T(5.0), T(2.0), 3.0, + [](auto l, auto r) { return l > r ? T(3.0) : T(2.0); }); + verify_op_ab>( + T(5.0), T(2.0), 2.0, + [](auto l, auto r) { return l < r ? T(3.0) : T(2.0); }); + verify_op_ab>( + T(5.0), T(2.0), 3.0, + [](auto l, auto r) { return l >= r ? T(3.0) : T(2.0); }); + verify_op_ab>( + T(5.0), T(2.0), 2.0, + [](auto l, auto r) { return l <= r ? T(3.0) : T(2.0); }); +} + +// Avoid same kernel name for different types and numbers of columns +template class ewops_c {}; +template void test_ewops_c() { + std::cout << "Test C " << SROWS << "x" << SCOLS << "\n"; + + static constexpr size_t NROWS = SROWS * 2; + static constexpr size_t NCOLS = SCOLS * 2; + + verify_op_c>( + T(5.0), T(2.0), 7.0, [](auto l, auto r) { return l + r; }); + verify_op_c>( + T(5.0), T(2.0), 3.0, [](auto l, auto r) { return l - r; }); + verify_op_c>( + T(5.0), T(2.0), 10.0, [](auto l, auto r) { return l * r; }); + verify_op_c>( + T(5.0), T(2.0), 2.5, [](auto l, auto r) { return l / r; }); + verify_op_c>( + T(5.0), T(5.0), 5.0, [](auto l, auto r) { return l == r ? l : T(1.0); }); + verify_op_c>( + T(5.0), T(4.0), 4.0, [](auto l, auto r) { return l == r ? l : r; }); + verify_op_c>( + T(5.0), T(5.0), 1.0, [](auto l, auto r) { return l != r ? l : T(1.0); }); + verify_op_c>( + T(5.0), T(2.0), 3.0, + [](auto l, auto r) { return l > r ? T(3.0) : T(2.0); }); + verify_op_c>( + T(5.0), T(2.0), 2.0, + [](auto l, auto r) { return l < r ? T(3.0) : T(2.0); }); + verify_op_c>( + T(5.0), T(2.0), 3.0, + [](auto l, auto r) { return l >= r ? T(3.0) : T(2.0); }); + verify_op_c>( + T(5.0), T(2.0), 2.0, + [](auto l, auto r) { return l <= r ? 
T(3.0) : T(2.0); }); +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (auto &combination : combinations) { + if (combination.nsize == 0 || + combination.nsize == 16) { // Intel AMX or architecture::intel_gpu_pvc + test_ewops_ab(); + test_ewops_ab(); + test_ewops_ab(); + test_ewops_c(); + test_ewops_c(); + + if (combination.nsize == 16) { // architecture::intel_gpu_pvc + test_ewops_ab(); + test_ewops_c(); +// This combination is not currently supported for sub group size = 32 in IGC +#if (!defined(SG_SZ) || SG_SZ != 32) + test_ewops_ab(); + test_ewops_ab(); + test_ewops_c(); + test_ewops_c(); +#endif + } + break; + } + + if (combination.nsize == 8) { // architecture::intel_gpu_dg2* + test_ewops_ab(); + test_ewops_ab(); + test_ewops_c(); + break; + } + } + + return 0; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8.cpp new file mode 100644 index 0000000000000..ba538e4ebffef --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8.cpp @@ -0,0 +1,14 @@ +//==----------- element_wise_all_ops_int8.cpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include "common.hpp" +#include "element_wise_all_ops_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_impl.hpp new file mode 100644 index 0000000000000..8a2f1f495e41d --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_impl.hpp @@ -0,0 +1,127 @@ +//===---element_wise_all_ops_int8_impl.hpp - DPC++ joint_matrix------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +template class add; +template class sub; +template class mul; +template class divide; +template class logic; + +template +void assert_ops_ref(host_accessor C, const R ref) { + for (size_t i = 0; i < Rows; i++) + for (size_t j = 0; j < Cols; j++) { + auto diff = C[i][j] - ref; + assert(std::fabs(static_cast(diff)) <= + std::numeric_limits::epsilon()); + } +} + +template +void matrix_verify_op(big_matrix &A, const R ref, OP op) { + buffer bufA(A.get_data(), range<2>(Rows, Cols)); + + queue q; + size_t sg_size = get_sg_size(q); + nd_range<2> r({Rows / TileRows, Cols / TileCols * sg_size}, {1, 1 * sg_size}); + + q.submit([&](handler &cgh) { + sycl::accessor accA{bufA, cgh, sycl::read_write}; + + cgh.parallel_for( + r, [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - 
spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + + joint_matrix_fill(sg, sub_a, 5); + + joint_matrix_apply(sg, sub_a, op); + ext::intel::experimental::matrix::joint_matrix_store( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TileRows) * Cols + + sg_starty / sg_size * TileCols, + Cols); + }); // parallel for + }).wait(); + assert_ops_ref(bufA.get_host_access(read_only), ref); +} + +template void test() { + static constexpr size_t Rows = TM * 2; + static constexpr size_t Cols = TK * 2; + Ta A[Rows][Cols]; + + big_matrix MA((Ta *)&A); + + matrix_verify_op, TResult>( + MA, 7, [=](auto &x) { x = x + 2; }); + matrix_verify_op, TResult>( + MA, 3, [=](auto &x) { x = x - 2; }); + matrix_verify_op, TResult>( + MA, 10, [=](auto &x) { x = x * 2; }); + matrix_verify_op, TResult>( + MA, 2, [=](auto &x) { x = x / 2; }); // truncation is expected + matrix_verify_op, TResult>( + MA, 7, [=](auto &x) { + if (x) { + if (x > 2 || x >= 2 || x < 2 || x <= 2) { + Ta val = (x != 2) ? 
x : 2; + val--; + val++; + if (x == 2) { + val -= 2; + val *= 3; + val /= 2; + } else { + val += 2; + } + x = val; + } + } + }); +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test(); + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test(); + break; + } + } + + return 0; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed.cpp new file mode 100644 index 0000000000000..be27718279b79 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed.cpp @@ -0,0 +1,16 @@ +//==------ element_wise_all_ops_int8_packed.cpp - DPC++ joint_matrix-------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// This test stores the matrix B that is VNNIed (packed). 
+ +#include "common.hpp" +#include "element_wise_all_ops_int8_packed_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed_impl.hpp new file mode 100644 index 0000000000000..7336bb8467fa5 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed_impl.hpp @@ -0,0 +1,130 @@ +//===---element_wise_all_ops_int8_packed_impl.hpp - DPC++ joint_matrix-----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +template class add; +template class sub; +template class mul; +template class divide; +template class logic; + +template +void assert_ops_ref(host_accessor C, + const TResult ref) { + for (size_t i = 0; i < Rows; i++) + for (size_t j = 0; j < Cols; j++) { + TResult diff = C[i][j] - ref; + assert(std::fabs(static_cast(diff)) <= + std::numeric_limits::epsilon()); + } +} + +template +void matrix_verify_op(big_matrix &B, const TResult ref, OP op) { + buffer bufB(B.get_data(), range<2>(Rows, Cols)); + + queue q; + size_t sg_size = get_sg_size(q); + nd_range<2> r({Rows / TileRows, Cols / TileCols * sg_size}, {1, 1 * sg_size}); + + q.submit([&](handler &cgh) { + sycl::accessor accB{bufB, cgh, sycl::read_write}; + + cgh.parallel_for( + r, [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_b; + + joint_matrix_fill(sg, sub_b, 5); + 
+ joint_matrix_apply(sg, sub_b, op); + ext::intel::experimental::matrix::joint_matrix_store( + sg, sub_b, + accB.template get_multi_ptr() + + (sg_startx * TileRows / VNNI) * Cols * VNNI + + sg_starty / sg_size * TileCols * VNNI, + Cols * VNNI); + }); // parallel for + }).wait(); + assert_ops_ref(bufB.get_host_access(read_only), ref); +} + +template +void test() { + static constexpr size_t Rows = TK * 2; + static constexpr size_t Cols = TN * 2; + T B[Rows][Cols]; + + big_matrix MB((T *)&B); + + matrix_verify_op, TResult>( + MB, 7, [=](auto &x) { x = x + 2; }); + matrix_verify_op, TResult>( + MB, 3, [=](auto &x) { x = x - 2; }); + matrix_verify_op, TResult>( + MB, 10, [=](auto &x) { x = x * 2; }); + matrix_verify_op, TResult>( + MB, 2, [=](auto &x) { x = x / 2; }); // truncation is expected + matrix_verify_op, TResult>( + MB, 7, [=](auto &x) { + if (x) { + if (x > 2 || x >= 2 || x < 2 || x <= 2) { + T val = (x != 2) ? x : 2; + val--; + val++; + if (x == 2) { + val -= 2; + val *= 3; + val /= 2; + } else { + val += 2; + } + x = val; + } + } + }); +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test(); + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test(); + break; + } + } + + return 0; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_scalar.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_scalar.cpp new file mode 100644 index 0000000000000..93432bca4a03e --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_scalar.cpp @@ -0,0 +1,17 @@ +//==---- element_wise_all_ops_scalar.cpp - DPC++ joint_matrix ------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix, gpu + +// RUN: %{build} -o %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out + +// TODO: Reenable, see https://github.com/intel/llvm/issues/14598 +// UNSUPPORTED: windows, linux + +#include "common.hpp" +#include "element_wise_all_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32.cpp new file mode 100644 index 0000000000000..6e2f8dcff6384 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32.cpp @@ -0,0 +1,18 @@ +//==----------- element_wise_all_ops_tf32.cpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix-tf32 +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include "common.hpp" + +constexpr size_t TN = 16; + +#include "element_wise_all_ops_tf32_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32_impl.hpp new file mode 100644 index 0000000000000..ba63c45deea2d --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32_impl.hpp @@ -0,0 +1,102 @@ +//===---element_wise_all_ops_tf32_impl.hpp - DPC++ joint_matrix------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#define TM 8 +#define TK 8 + +template +void assert_ops_ref(host_accessor C, + const float ref) { + for (size_t i = 0; i < M; i++) + for (size_t j = 0; j < N; j++) { + auto diff = C[i][j] - ref; + assert(std::fabs(static_cast(diff)) < + std::numeric_limits::epsilon()); + } +} + +template +void matrix_verify_op(big_matrix &A, const float ref, OP op) { + buffer bufA(A.get_data(), range<2>(M, K)); + + queue q; + size_t sg_size = get_sg_size(q); + nd_range<2> r({M / TileM, K / TileK * sg_size}, {1, 1 * sg_size}); + + q.submit([&](handler &cgh) { + sycl::accessor accA{bufA, cgh, sycl::read_write}; + + cgh.parallel_for( + r, [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + joint_matrix_fill(sg, sub_a, round_to_tf32(5.0)); + + joint_matrix_apply(sg, sub_a, op); + ext::intel::experimental::matrix::joint_matrix_store( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TileM) * K + sg_starty / sg_size * TileK, + K); + }); // parallel for + }).wait(); + assert_ops_ref(bufA.get_host_access(sycl::read_only), ref); +} + +static constexpr size_t MATRIX_M = TM * 2; +static constexpr size_t MATRIX_K = TK * 2; +float A[MATRIX_M][MATRIX_K]; + +int main() { + + big_matrix MA((float *)&A); + + matrix_verify_op(MA, 7.0, + [=](auto &x) { x = x + round_to_tf32(2); }); + matrix_verify_op(MA, 3.0, + [=](auto &x) { x = x - round_to_tf32(2); }); + matrix_verify_op(MA, 10.0, + [=](auto &x) { x = x * round_to_tf32(2); }); + 
matrix_verify_op(MA, 2.5, + [=](auto &x) { x = x / round_to_tf32(2); }); + matrix_verify_op(MA, 7.0, [=](auto &x) { + if (x) { + if (x > 2 || x >= 2 || x < 2 || x <= 2) { + float val = (x != 2) ? x : 2; + val--; + val++; + if (x == 2) { + val -= 2; + val *= 3; + val /= 2; + } else { + val += 2; + } + x = val; + } + } + }); + + return 0; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp new file mode 100644 index 0000000000000..83db076d2be20 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp @@ -0,0 +1,23 @@ +//==----------- element_wise_all_sizes.cpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix +// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 + +// TODO: Reenable, see https://github.com/intel/llvm/issues/14598 +// UNSUPPORTED: windows, linux + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// This is a version of the test with disabled device code +// split to test against fixed bug in IGC +// RUN: %{build} -fsycl-device-code-split=off -o %t_split.out +// RUN: %if gpu-intel-dg2 %{ %{run} %t_split.out %} + +#include "common.hpp" +#include "element_wise_all_sizes_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes_impl.hpp new file mode 100644 index 0000000000000..5228a154e9f6f --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes_impl.hpp @@ -0,0 +1,130 @@ +//===---element_wise_all_ops_all_sizes_impl.hpp - DPC++ joint_matrix-------===// +// +// Part of the 
LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +static constexpr size_t M_MULTIPLIER = 16; +template class add; + +template +void assert_ops_ref(host_accessor C, + const T ref) { + for (size_t i = 0; i < M; i++) + for (size_t j = 0; j < N; j++) { + if (std::is_same_v) { + auto diff = make_fp32(C[i][j]) - make_fp32(ref); + assert(std::fabs(static_cast(diff)) < + std::numeric_limits::epsilon()); + } else if (std::is_same_v) { + assert(C[i][j] == ref); + } + } +} + +template +void matrix_verify_add(const T1 val1, const T1 val2, const T1 result) { + static constexpr size_t M = TM * M_MULTIPLIER; + static constexpr size_t K = 128; + T MatA[M][K]; + + size_t NDRangeM = M / TM; + size_t NDRangeK = K / TK; + queue q; + size_t sg_size = get_sg_size(q); + nd_range<2> r({NDRangeM, NDRangeK * sg_size}, {1, 1 * sg_size}); + big_matrix A((T *)&MatA); + + buffer bufA(A.get_data(), range<2>(M, K)); + + q.submit([&](handler &cgh) { + sycl::accessor accA{bufA, cgh, sycl::read_write}; + + cgh.parallel_for( + r, [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; + + joint_matrix_fill(sg, sub_a, val1); + + joint_matrix_apply(sg, sub_a, [=](T &x) { x += val2; }); + + ext::intel::experimental::matrix::joint_matrix_store( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TM) * K + sg_starty / sg_size * TK, + K); + }); // parallel for + }).wait(); + assert_ops_ref(bufA.get_host_access(), result); +} + +template 
+void add_ref() { + if constexpr (std::is_same_v) { + // Tests whether 5 + 2 = 7 operation is successful. + matrix_verify_add( + bfloat16(5.0), bfloat16(2.0), bfloat16(7.0)); + } + if constexpr (std::is_same_v) { + matrix_verify_add( + 5 /*val1*/, 2 /*val2*/, 7 /*result*/); + } +} + +template void test() { + add_ref>(); + add_ref>(); + add_ref>(); + add_ref>(); + add_ref>(); + add_ref>(); + add_ref>(); + + add_ref>(); + add_ref>(); + add_ref>(); + add_ref>(); + add_ref>(); + add_ref>(); + add_ref>(); +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test<16>(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test<16>(); + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test<8>(); + break; + } + } + + std::cout << "Passed\n"; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp index 19c2d5ef9c273..855ed65900a6f 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{build} -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" -#include "../element_wise_ops_impl.hpp" +#include "common.hpp" +#include "element_wise_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops_impl.hpp new file mode 100644 index 0000000000000..8ffbbf8eabff1 --- /dev/null +++ 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops_impl.hpp @@ -0,0 +1,150 @@ +//==----------- element_wise_ops_impl.hpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +template +void matrix_multiply(big_matrix &C, big_matrix &A, + big_matrix &B) { + // stride should be X's cols, e.g., B's stride = N*4 + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC(C.get_data(), range<2>(M, N)); + + queue q; + size_t sg_size = get_sg_size(q); + q.submit([&](handler &cgh) { + auto accC = bufC.template get_access(cgh); + auto accA = bufA.template get_access(cgh); + auto accB = bufB.template get_access(cgh); + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sycl::sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; + // For B, we assume B has been already VNNIed.
+ joint_matrix + sub_b; + joint_matrix sub_c; + + joint_matrix_load( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TM) * K + k * TK, + K); + joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr() + + (k * TK / VF) * (N * VF) + sg_starty / sg_size * TN * VF, + N * VF); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + } + joint_matrix_apply(sg, sub_c, [](Tc &x) { x = x * 2; }); + joint_matrix_store( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +template +bool test() { + + static constexpr size_t MATRIX_M = TM * 2; + static constexpr size_t MATRIX_N = TN * 2; + static constexpr size_t MATRIX_K = TK * 2; + + Ta A[MATRIX_M][MATRIX_K]; + Ta B[MATRIX_K / VF][MATRIX_N * VF]; + Tc C[MATRIX_M][MATRIX_N]; + Tc D[MATRIX_M][MATRIX_N]; + + matrix_rand(MATRIX_M, MATRIX_K, (Ta *)A, (Ta)100); + matrix_rand(MATRIX_K / VF, MATRIX_N * VF, (Ta *)B, (Ta)100); + matrix_fill(MATRIX_M, MATRIX_N, (Tc *)C, (Tc)1); + matrix_fill(MATRIX_M, MATRIX_N, (Tc *)D, (Tc)1); + + big_matrix MC((Tc *)&C); + big_matrix MD((Tc *)&D); + big_matrix MA((Ta *)&A); + big_matrix MB((Ta *)&B); + + matrix_multiply(MC, MA, MB); + matrix_multiply_ref((Ta *)A, (Ta *)B, (Tc *)D, MATRIX_M, + MATRIX_N, MATRIX_K / VF, false, false, + false, [](Tc &x) { x = x * 2; }); + bool res = matrix_compare(MATRIX_M, MATRIX_N, (Tc *)C, (Tc *)D); + + std::cout << TM << "x" << TN << "x" << TK << ": " + << (res ? 
"passed" : "failed") << std::endl; + return res; +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + bool passed = true; + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + passed &= test(); + passed &= test(); + passed &= test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + passed &= test(); + passed &= test(); + passed &= test(); +#if (!defined(SG_SZ) || SG_SZ != 32) + // These combination are not currently supported for subgroup size = 32 in + // IGC + passed &= test(); + passed &= test(); + passed &= test(); +#endif + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + passed &= test(); + passed &= test(); + passed &= test(); + break; + } + } + + return !passed; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/elemwise_irreg_size_ops_bf16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/elemwise_irreg_size_ops_bf16.cpp new file mode 100644 index 0000000000000..5b4c785d46d3d --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/elemwise_irreg_size_ops_bf16.cpp @@ -0,0 +1,145 @@ +//==-------- elemwise_irreg_size_ops_bf16.cpp - DPC++ joint_matrix---- ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This test is for element wise operations when matrix size does not multiply +// SG size. This corner case only applies to AMX. Also, it tests bf16 type. 
+// only run this on AMX +// REQUIRES: cpu +// REQUIRES: matrix + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include "common.hpp" + +#define SG_SZ 16 + +// 10x12 is not a multiple of the sg size, so the slicing implementation will have +// to insert padding +#define TM 10 +#define TN 12 +#define TK 16 + +template +void matrix_multiply(big_matrix &C, + big_matrix &A, + big_matrix &B) { + size_t M = NUM_ROWS_C; + size_t N = NUM_COLS_C; + size_t K = NUM_COLS_A; + + assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * 2); + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K / 2, N * 2)); + buffer bufC((float *)C.get_data(), range<2>(M, N)); + + queue q; + q.submit([&](handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), + [accA, accB, accC, M, N, K](nd_item<2> spmd_item) + [[intel::reqd_sub_group_size(SG_SZ)]] + + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + // For B, we assume B has been already VNNIed. + joint_matrix + sub_b; + joint_matrix sub_c; + joint_matrix_load( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + N, layout::row_major); + for (int k = 0; k < K; k += TK) { + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TM) * K + k, + K); + // Assume B is already in VNNI format.
+ joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr() + + (k) * (N) + sg_starty / SG_SZ * TN * 2, + N * 2); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + } + joint_matrix_apply(sg, sub_c, [](float &x) { x += 5.0; }); + joint_matrix_store( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +static constexpr size_t MATRIX_M = TM * 2; +static constexpr size_t MATRIX_N = TN * 2; +static constexpr size_t MATRIX_K = TK * 2; +bfloat16 A[MATRIX_M][MATRIX_K]; +bfloat16 B[MATRIX_K / 2][MATRIX_N * 2]; +float C[MATRIX_M][MATRIX_N]; +float D[MATRIX_M][MATRIX_N]; + +int main() { + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_K; j++) { + A[i][j] = bfloat16(1.0f * (i + j)); + } + } + for (int i = 0; i < MATRIX_K / 2; i++) { + for (int j = 0; j < MATRIX_N * 2; j++) { + B[i][j] = bfloat16(2.0f * i + 3.0f * j); + } + } + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + C[i][j] = 1.0; + D[i][j] = 1.0; + } + } + + big_matrix MC((float *)&C); + big_matrix MD((float *)&D); + big_matrix MA((bfloat16 *)&A); + big_matrix MB((bfloat16 *)&B); + matrix_multiply(MC, MA, MB); + matrix_multiply_ref( + (bfloat16 *)A, (bfloat16 *)B, (float *)D, MATRIX_M, MATRIX_N, + MATRIX_K / 2, false, false, false, [](float &x) { x = x + 5.0; }); + + bool res = true; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + if (C[i][j] != D[i][j]) + res = false; + } + } + std::cout << (res ? 
"passed" : "failed") << std::endl; + return !res; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC.cpp new file mode 100644 index 0000000000000..044c8072bb3bc --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC.cpp @@ -0,0 +1,14 @@ +//==----------- get_coord_float_matC.cpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include "common.hpp" +#include "get_coord_float_matC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC_impl.hpp new file mode 100644 index 0000000000000..bb0b6336a0ec5 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC_impl.hpp @@ -0,0 +1,130 @@ +//==----------- get_coord_float_matC_impl.hpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include +#include + +template class add_rows; + +// clang-format off +/* +Here's how the data is distributed for sub group size = 16 on PVC +W0 --> 0 1 2 3 4 5 6 7 +wi [0,0] -> i=0, [0, 0] wi [0,1] --> i=0, [0, 1] wi [0,15] --> i=0, [0, 15] + i=1, [1, 0] i=1, [1, 1] i=1, [1, 15] + i=2, [2, 0] i=2, [2, 1] ... + ... .... 
+ i=7, [7, 0] i=7, [7, 1] +*/ +// clang-format on + +template +void matrix_sum_rows(big_matrix &C, T *sum_rows) { + buffer bufC((T *)C.get_data(), range<2>(Rows, Cols)); + buffer sum_rows_v(sum_rows, Rows); + + queue q; + size_t sg_size = get_sg_size>(q); + q.submit([&](handler &cgh) { + sycl::accessor accC{bufC, cgh, sycl::read_write}; + sycl::accessor v{sum_rows_v, cgh, sycl::read_write}; + + cgh.parallel_for>( + nd_range<2>({Rows / TileRows, Cols / TileCols * sg_size}, + {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_c; + + joint_matrix_load( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TileRows) * Cols + + sg_starty / sg_size * TileCols, + Cols, layout::row_major); + + T sum_local_rows[Rows] = {0}; + + ext::intel::experimental::matrix::joint_matrix_apply( + sg, sub_c, [&](T &x, size_t row, size_t col) { + sum_local_rows[row + global_idx * TileRows] += x; + }); + for (int i = 0; i < Rows; i++) { + sum_local_rows[i] = + reduce_over_group(sg, sum_local_rows[i], sycl::plus<>()); + // only Groups leader perform the global reduction + if (global_idy % sg_size == 0) { + sycl::atomic_ref + aref(v[i]); + aref.fetch_add(sum_local_rows[i]); + } + } + }); // parallel for + }).wait(); +} + +template void test() { + constexpr size_t SCALE = 2; + static constexpr size_t Rows = TM * SCALE; + static constexpr size_t Cols = TN * SCALE; + + T sum_rows[Rows] = {0}; + T sum_rows_ref[Rows] = {0}; + T C[Rows][Cols]; + 
big_matrix MC((T *)&C); + + matrix_rand(Rows, Cols, (T *)&C, (T)100); + matrix_sum_rows(MC, sum_rows); + + for (int i = 0; i < Rows; i++) { + for (int j = 0; j < Cols; j++) { + sum_rows_ref[i] += C[i][j]; + } + assert(std::fabs(sum_rows_ref[i] - sum_rows[i]) <= FLOAT_EPSILON); + } +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test(); + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test(); + break; + } + } + return 0; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matA.cpp new file mode 100644 index 0000000000000..8e953c8689305 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matA.cpp @@ -0,0 +1,14 @@ +//==----------- get_coord_int8_matA.cpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include "common.hpp" +#include "get_coord_int8_matA_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matA_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matA_impl.hpp new file mode 100644 index 0000000000000..f3d89cc717ef4 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matA_impl.hpp @@ -0,0 +1,176 @@ +//==----------- get_coord_int8_matA_impl.hpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include +#include + +template class add_rows; + +template +void sum_rows_ref( + host_accessor A, + host_accessor sum_rows) { + int sum_rows_ref[Rows] = {0}; + for (size_t i = 0; i < Rows; i++) { + for (size_t j = 0; j < Cols; j++) { + sum_rows_ref[i] += A[i][j]; + } + auto diff = sum_rows[i] - sum_rows_ref[i]; + assert(std::fabs(static_cast(diff)) <= + std::numeric_limits::epsilon()); + } +} + +// clang-format off +/* For sub group size = 16: +wi [0,0] -> i=0, [0, 0] wi [0,1] --> i=0, [0, 2] wi [0,15] --> i=0, [0, 30] + i=1, [0, 1] i=1, [0, 3] i=1, [0, 31] + i=2, [1, 0] i=2, [1, 2] i=2, [1, 30] + i=3, [1, 1] i=3, [1, 3] i=3, [1, 31] + i=4, [2, 0] i=4, [2, 2] ... + i=5, [2, 1] i=5, [2, 3] + ... .... 
+ i=14,[7, 0] i=14, [7, 2] + i=15,[7, 1] i=15, [7, 3] i=15, [7, 31] + +Here's how the distribution of the A matrix looks like for this test case +x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x +x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x +x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x +x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x +x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x +x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x +x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x +x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x +<--------------------------------- SG1 ---------------------------------> +x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x +x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x +x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x +x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x +x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x +x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x +x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x +x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x +<0> <1> <2> <3> <4> <5> <6> <7> ..... WORK ITEMS +Each work item has 16 elements <8 rows and 2 cols of the original matrix> +the data_slice holds the matrix elements in the following order: +0 0 0 0 + / + / +1 1 1 1 + / + / +2 2 2 2 + / + / +3 3 3 3 +W0 --> 0 0 1 1 2 2 3 3 .... 
7 7 +*/ +// clang-format on + +template +void matrix_sum_rows(big_matrix &A) { + buffer bufA(A.get_data(), range<2>(Rows, Cols)); + + // size of vector is equal to number of rows in big matrix + TResult sum_rows[Rows] = {0}; + buffer sum_rows_v(sum_rows, Rows); + queue q; + size_t sg_size = get_sg_size>(q); + nd_range<2> r({Rows / TileRows, Cols / TileCols * sg_size}, {1, 1 * sg_size}); + q.submit([&](handler &cgh) { + sycl::accessor accA{bufA, cgh, sycl::read_write}; + sycl::accessor v{sum_rows_v, cgh, sycl::read_write}; + + cgh.parallel_for>( + r, [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sycl::sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TileRows * Cols) + + sg_starty / sg_size * TileCols, + Cols); + + TResult sum_local_rows[Rows] = {0}; + + ext::intel::experimental::matrix::joint_matrix_apply( + sg, sub_a, [&](T &x, size_t row, size_t col) { + sum_local_rows[row + global_idx * TileRows] += x; + }); + for (int i = 0; i < Rows; ++i) { + sum_local_rows[i] = + reduce_over_group(sg, sum_local_rows[i], sycl::plus<>()); + + // only Groups leader performs the global reduction + if (global_idy % sg_size == 0) { + sycl::atomic_ref + aref(v[i]); + aref.fetch_add(sum_local_rows[i]); + } + } + }); // parallel for + }).wait(); + sum_rows_ref(bufA.get_host_access(), + sum_rows_v.get_host_access()); +} + +template void test() { + static constexpr size_t Rows = TM * 2; + static constexpr size_t Cols = TK * 2; + T A[Rows][Cols]; + + big_matrix MA((T *)&A); + + for (int i = 0; i < Rows; i++) { + for (int j = 0; j < Cols; j++) { + A[i][j] = i + j; + } + } + + matrix_sum_rows(MA); +} + +int 
main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test(); + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test(); + break; + } + } + return 0; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB.cpp new file mode 100644 index 0000000000000..a5166bbf54201 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB.cpp @@ -0,0 +1,15 @@ +//==----------- get_coord_int8_matB.cpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// XFAIL: cpu + +#include "common.hpp" +#include "get_coord_int8_matB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB_impl.hpp new file mode 100644 index 0000000000000..8b63dadc029b3 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB_impl.hpp @@ -0,0 +1,214 @@ +//==----------- get_coord_int8_matB_impl.hpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include +#include + +template class add_cols; + +template +void sum_cols_ref( + host_accessor B, + host_accessor sum_cols) { + TResult sum_cols_ref[Cols] = {0}; + for (size_t j = 0; j < Cols; j++) { + for (size_t i = 0; i < Rows; i++) { + sum_cols_ref[j] += B[i][j]; + } + auto diff = sum_cols[j] - sum_cols_ref[j]; + assert(std::fabs(static_cast(diff)) <= + std::numeric_limits::epsilon()); + } +} + +// clang-format off +/* + Here is a demonstration of how matrix B will be divided across + work items for this test case for sub group size = 16 on PVC. + < --------------- 128 ----------------------------------> + x x x x x x x x x x x x x x x x .......... x x x x x x ^ + x x x x x x x x x x x x x x x x .......... x x x x x x 16 + x x x x x x x x x x x x x x x x .......... x x x x x x | + ..... | + x x x x x x x x x x x x x x x x .......... x x x x x x | + x x x x x x x x x x x x x x x x .......... x x x x x x v + + --------------- 64 ----------------> + x x x x x x .......... x x x x x x ^ + x x x x x x .......... x x x x x x 8 + x x x x x x .......... x x x x x x | <-- part of (VNNI-ed) + ..... | original matrix each SG + x x x x x x .......... x x x x x x | holds + x x x x x x .......... x x x x x x v + < WI0 > < WI15 > + <-------- 16 -------------> + x x x .......... x x x ^ + x x x .......... x x x | + x x x .......... x x x | <-- part of (non-VNNI-ed) original matrix + ..... | each SG holds + x x x .......... x x x | + x x x .......... x x x | + x x x .......... x x x 32 + x x x .......... x x x | + x x x .......... x x x | + x x x .......... x x x | + x x x .......... x x x | + x x x .......... x x x | + x x x .......... x x x v + If we divide the above matrix across 16 (SG_SZ) work items, + each WI will hold 32 elements. And these 32 elements will be + 8x4 chunks as shown in the VNNI-ed matrix figure. 
+ +The total distribution among the WIs in ALL the sub-groups is as follows: +This is useful to figure out how the global index is to be calculated + +W0 --> 0 0 0 0 1 1 1 1 ... 7 7 7 7 --> total 32 elements +wi [0,0] --> i=0, [0, 0] wi [0,1] --> i=0, [0, 4] wi [0,15] --> i=0, [0, 60] | wi [0,16] --> i=0, [0, 64] + i=1, [0, 1] i=1, [0, 5] i=1, [0, 61] | i=1, [0, 65] + i=2, [0, 2] i=2, [0, 6] i=2, [0, 62] | i=2, [0, 66] + i=3, [0, 3] i=3, [0, 7] i=3, [0, 63] | i=3, [0, 67] + i=4, [1, 0] i=4, [1, 4] i=4, [1, 60] | .... + i=5, [1, 1] i=5, [1, 5] i=5, [1, 61] | + i=6, [1, 2] i=6, [1, 6] i=6, [1, 62] | + i=7, [1, 3] i=7, [1, 7] i=7, [1, 63] | + ... ... .... | + i=28,[7, 0] i=28,[7, 4] i=28,[7, 60] | i=28, [7, 124] + i=29,[7, 1] i=29,[7, 5] i=29,[7, 61] | i=29, [7, 125] + i=30,[7, 2] i=30,[7, 6] i=30,[7, 62] | i=30, [7, 126] + i=31,[7, 3] i=31,[7, 7] i=31,[7, 63] | i=31, [7, 127] +---------------------------------------------------------------------------------------- --------------------------- +wi [1,0] --> i=0, [8, 0] + i=1, [8, 1] + i=2, [8, 2] + i=3, [8, 2] + ... 
+ i=28, [15, 0] + i=29, [15, 1] + i=30, [15, 2] + i=31, [15, 3] +*/ + +// clang-format on + +template +void matrix_sum_cols(big_matrix &B, + big_matrix &Bvnni) { + buffer bufB(B.get_data(), range<2>(Rows, Cols)); + buffer bufBvnni(Bvnni.get_data(), range<2>(Rows / VNNI, Cols * VNNI)); + + TResult sum_cols[Cols] = {0}; + buffer sum_cols_v(sum_cols, Cols); + + size_t NDRangeK = Rows / TileRows; + size_t NDRangeN = Cols / TileCols; + queue q; + size_t sg_size = get_sg_size>(q); + nd_range<2> r({NDRangeK, NDRangeN * sg_size}, {1, 1 * sg_size}); + + q.submit([&](handler &cgh) { + sycl::accessor accB{bufBvnni, cgh, sycl::read_write}; + sycl::accessor v{sum_cols_v, cgh, sycl::read_write}; + + cgh.parallel_for>( + r, [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sycl::sub_group sg = spmd_item.get_sub_group(); + + joint_matrix + sub_b; + + joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr() + + (sg_startx * (TileRows / VNNI) * Cols * VNNI) + + sg_starty / sg_size * TileCols * VNNI, + Cols * VNNI); + + TResult sum_local_cols[Cols] = {0}; + ext::intel::experimental::matrix::joint_matrix_apply( + sg, sub_b, [&](T &x, size_t row, size_t col) { + // the coordinates returned are in the logical range + // [Rows,Cols] If users want to retrieve the VNNIed + // coordinates, they can be obtained using colVNNI = col/VNNI + // rowVNNI = row*VNNI + size_t global_index = col + global_idy / sg_size * TileCols; + sum_local_cols[global_index] += x; + }); + + for (int i = 0; i < Cols; i++) { + sum_local_cols[i] = + reduce_over_group(sg, sum_local_cols[i], sycl::plus<>()); + if (global_idy % sg_size == 0) { + sycl::atomic_ref + aref(v[i]); + aref.fetch_add(sum_local_cols[i]); + } + } + }); // 
parallel for + }).wait(); + sum_cols_ref(bufB.get_host_access(), + sum_cols_v.get_host_access()); +} + +template +void test() { + static constexpr size_t scale = 2; + static constexpr size_t MATRIX_K = TK * scale; + static constexpr size_t MATRIX_N = TN * scale; + + T B[MATRIX_K][MATRIX_N]; + big_matrix MB((T *)&B); + + T Bvnni[MATRIX_K / VNNI][MATRIX_N * VNNI]; + big_matrix MBvnni((T *)&Bvnni); + + for (int i = 0; i < MATRIX_K; i++) { + for (int j = 0; j < MATRIX_N; j++) { + B[i][j] = i + j; + } + } + matrix_vnni(MATRIX_K, MATRIX_N, *B, *Bvnni, VNNI); + // This test calculates sum of columns in the non VNNI B matrix + matrix_sum_cols(MB, MBvnni); +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test(); + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test(); + break; + } + } + return 0; +} \ No newline at end of file diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes.cpp new file mode 100644 index 0000000000000..e4a9523314ea0 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes.cpp @@ -0,0 +1,14 @@ +//==-------- joint_matrix_all_sizes.cpp - DPC++ joint_matrix---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include "common.hpp" +#include "joint_matrix_all_sizes_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes_impl.hpp new file mode 100644 index 0000000000000..b3001a68bb227 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes_impl.hpp @@ -0,0 +1,155 @@ +//===---joint_matrix_all_sizes_impl.hpp - DPC++ joint_matrix---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +static constexpr size_t M_MULTIPLIER = 16; + +template class mult; + +template +void matrix_multiply(big_matrix &C, big_matrix &A, + big_matrix &B) { + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC(C.get_data(), range<2>(M, N)); + + queue q; + size_t sg_size = get_sg_size(q); + q.submit([&](handler &cgh) { + sycl::accessor accC{bufC, cgh, sycl::read_write}; + sycl::accessor accA{bufA, cgh, sycl::read_only}; + sycl::accessor accB{bufB, cgh, sycl::read_only}; + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = 
spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; + // For B, we assume B has been already VNNIed. + joint_matrix + sub_b; + joint_matrix sub_c; + + joint_matrix_load( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TM) * K + k * TK, + K); + joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr() + + (k * TK / vnniFactor) * (N * vnniFactor) + + sg_starty / sg_size * TN * vnniFactor, + N * vnniFactor); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + } + joint_matrix_store( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +template +void init_and_multiply() { + static constexpr size_t MATRIX_M = tM * M_MULTIPLIER; + static constexpr size_t MATRIX_N = 128; + static constexpr size_t MATRIX_K = 128; + + std::cout << "MATRIX_M=" << MATRIX_M << "\n"; + + T A[MATRIX_M][MATRIX_K]; + T B[MATRIX_K][MATRIX_N]; + T Bvnni[MATRIX_K / vnni_factor][MATRIX_N * vnni_factor]; + TResult C[MATRIX_M][MATRIX_N]; + TResult D[MATRIX_M][MATRIX_N]; + + matrix_rand(MATRIX_M, MATRIX_K, (T *)A, (T)50); + matrix_rand(MATRIX_K, MATRIX_N, (T *)B, (T)50); + matrix_fill(MATRIX_M, MATRIX_N, (TResult *)C, (TResult)1); + matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, (TResult)1); + + big_matrix MC((TResult *)&C); + big_matrix MD((TResult *)&D); + big_matrix MA((T *)&A); + matrix_vnni(MATRIX_K, MATRIX_N, (T *)&B, (T *)&Bvnni, vnni_factor); + big_matrix MBvnni( + (T *)&Bvnni); + + matrix_multiply(MC, MA, MBvnni); + matrix_multiply_ref((T *)A, (T *)B, (TResult 
*)D, MATRIX_M, MATRIX_N, + MATRIX_K); + + assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)C, (TResult *)D)); +} + +template +void test() { + init_and_multiply>(); + init_and_multiply>(); + init_and_multiply>(); + init_and_multiply>(); + init_and_multiply>(); + init_and_multiply>(); + init_and_multiply>(); + init_and_multiply>(); +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test(); + test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test(); + test(); + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test(); + test(); + break; + } + } + + return 0; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_annotated_ptr.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_annotated_ptr.cpp new file mode 100644 index 0000000000000..e21bdaa06a328 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_annotated_ptr.cpp @@ -0,0 +1,19 @@ +//==-------- joint_matrix_annotated_ptr.cpp - DPC++ joint_matrix-----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %} +// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %} + +#include "common.hpp" + +constexpr size_t TN = 16; + +#include "joint_matrix_annotated_ptr_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_annotated_ptr_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_annotated_ptr_impl.hpp new file mode 100644 index 0000000000000..5463ea040d1eb --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_annotated_ptr_impl.hpp @@ -0,0 +1,128 @@ +//===---joint_matrix_annotated_ptr_impl.hpp - DPC++ joint_matrix-----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define TM 8 +#define TK 16 + +template class mult; + +template +void matrix_multiply(T1 *C, T2 *A, T2 *B, queue &q) { + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + size_t sg_size = get_sg_size>(q); + q.submit([&](handler &cgh) { + cgh.parallel_for>( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + joint_matrix + sub_b; + joint_matrix + sub_bp; + joint_matrix sub_c; + + auto C_ptr = syclex::annotated_ptr{ + C, syclex::properties{ + syclintelex::read_assertion>}}; + auto A_ptr = syclex::annotated_ptr{ + A, + syclex::properties{syclintelex::read_assertion< + syclintelex::cache_control>}}; + auto B_ptr = syclex::annotated_ptr{ + B, + syclex::properties{syclintelex::read_hint< + syclintelex::cache_control>}}; + joint_matrix_load(sg, sub_c, + C_ptr + (sg_startx * TM) * N + + sg_starty / sg_size * TN, + N, layout::row_major); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load(sg, sub_a, A_ptr + (sg_startx * TM) * K + k * TK, + K); + if constexpr (vnniFactor == 0) { + joint_matrix_load( + sg, sub_b, B_ptr + (k * TK) * N + sg_starty / sg_size * TN, + N); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + } else { + joint_matrix_load(sg, sub_bp, + B_ptr + + (k * TK / vnniFactor) * (N * vnniFactor) + + sg_starty / sg_size * TN * vnniFactor, + N * vnniFactor); + + joint_matrix_mad(sg, sub_c, sub_a, sub_bp, sub_c); + } + } + auto C_w_ptr = syclex::annotated_ptr{ + C, + 
syclex::properties{syclintelex::write_hint< + syclintelex::cache_control>}}; + joint_matrix_store(sg, sub_c, + C_w_ptr + (sg_startx * TM) * N + + sg_starty / sg_size * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +int main() { + queue q; + static constexpr size_t M = TM * 2; + static constexpr size_t N = TN * 2; + static constexpr size_t K = TK * 2; + static constexpr unsigned int vnniFactor = 2; + bfloat16 *A = malloc_shared(M * K, q); + bfloat16 *B = malloc_shared(K * N, q); + bfloat16 *vnniB = malloc_shared(K * N, q); + float *C = malloc_shared(M * N, q); + float *D = malloc_shared(M * N, q); + + matrix_fill(M, K, A, [](int i, int j) { return 1.0f * (i + j); }); + matrix_fill(K, N, (bfloat16 *)B, + [](int i, int j) { return 2.0f * i + 3.0f * j; }); + matrix_fill(M, N, C, 1.0f); + matrix_fill(M, N, D, 1.0f); + + matrix_vnni(K, N, B, vnniB, vnniFactor); + + matrix_multiply_ref(A, B, D, M, N, K); + + // Currently row major B fails when annotated_ptr is used + matrix_multiply(C, A, B, q); + bool res0 = matrix_compare(M, N, C, D); + std::cout << (res0 ? "B row major passed" : "failed") << std::endl; + + matrix_fill(M, N, C, 1.0f); + matrix_multiply(C, A, vnniB, q); + bool res1 = matrix_compare(M, N, C, D); + std::cout << (res1 ? "B VNNI passed" : "failed") << std::endl; + + return !(res0 & res1); +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16.cpp new file mode 100644 index 0000000000000..cdfa28c07673e --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16.cpp @@ -0,0 +1,14 @@ +//==----------- joint_matrix_apply_bf16.cpp - DPC++ joint_matrix-----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include "common.hpp" +#include "joint_matrix_apply_bf16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16_impl.hpp new file mode 100644 index 0000000000000..3fc96f77e020a --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16_impl.hpp @@ -0,0 +1,113 @@ +//===---joint_matrix_apply_bf16_impl.hpp - DPC++ joint_matrix--------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +template class add; +template class add_func; + +template struct apply_add { + void operator()(T &x) const { x = x + T(2); } +}; + +template +void matrix_verify_add(big_matrix &A, const TResult ref, + F &&lambda) { + buffer bufA(A.get_data(), range<2>(Rows, Cols)); + + queue q; + size_t sg_size = get_sg_size(q); + nd_range<2> r({Rows / TileRows, Cols / TileCols * sg_size}, {1, 1 * sg_size}); + + q.submit([&](handler &cgh) { + accessor accA{bufA, cgh}; + + cgh.parallel_for( + r, [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + + joint_matrix_fill(sg, sub_a, T(5.0)); + + joint_matrix_apply(sg, sub_a, lambda); + + 
ext::intel::experimental::matrix::joint_matrix_store( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TileRows) * Cols + + sg_starty / sg_size * TileCols, + Cols); + }); // parallel for + }).wait(); + // Check if the results are correct + { + host_accessor Acc{bufA}; + assert(std::all_of(Acc.begin(), Acc.end(), [=](auto Elem) { + return (std::fabs(static_cast(make_fp32(Elem) - ref)) < + std::numeric_limits::epsilon()); + })); + } +} + +template void test() { + std::cout << "Testing: " << TM << " x " << TK << " [TM x TK]" << std::endl; + + static constexpr size_t Rows = TM * 2; + static constexpr size_t Cols = TK * 2; + T A[Rows][Cols]; + + big_matrix MA((T *)&A); + + matrix_verify_add>( + MA, 7.0, [=](T &x) { x = x + T(2); }); + matrix_verify_add>( + MA, 7.0, apply_add()); +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test(); + // This combination is not currently supported for sub group size = 32 in + // IGC +#if (!defined(SG_SZ) || SG_SZ != 32) + test(); + test(); + test(); +#endif + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test(); + break; + } + } + return 0; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp new file mode 100644 index 0000000000000..e61448852710b --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp @@ -0,0 +1,14 @@ +//==------ joint_matrix_apply_two_matrices.cpp - DPC++ joint_matrix--------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix + +// RUN: %{build} -ffp-model=precise -o %t.out +// RUN: %{run} %t.out + +#include "common.hpp" +#include "joint_matrix_apply_two_matrices_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices_impl.hpp new file mode 100644 index 0000000000000..9751571bcbcf5 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices_impl.hpp @@ -0,0 +1,157 @@ +//==------- joint_matrix_apply_two_matrices_impl.hpp - DPC++ joint_matrix--==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include + +template +bool apply_verify(Tc *C, Tc *D, Ta *A, Ta *Ar) { + for (size_t i = 0; i < M; i++) + for (size_t j = 0; j < N; j++) { + Tc diffc = D[i * N + j] - C[i * N + j] * 2; + Ta diffa = Ar[i * N + j] - (A[i * N + j] + 42); + if constexpr (std::is_same_v) { + if (std::fabs(diffc) > FLOAT_EPSILON || + std::fabs(diffa) > FLOAT_EPSILON || std::isnan(C[i * N + j]) || + std::isnan(A[i * N + j])) { + return false; + } + } else { + if (std::abs(diffc) > 0 || std::abs(diffa) > 0) { + return false; + } + } + } + return true; +} +template +bool apply_two_matrices(Tc *C, Tc *D, Ta *A, Ta *Ar, queue q) { + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + + size_t sg_size = get_sg_size(q); + q.submit([&](handler &cgh) { + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + const auto 
global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + auto pC = + address_space_cast(C); + auto pD = + address_space_cast(D); + auto pA = + address_space_cast(A); + auto pAr = + address_space_cast(Ar); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; + joint_matrix + sub_ar; + joint_matrix sub_c; + joint_matrix sub_d; + + joint_matrix_load( + sg, sub_c, pC + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + joint_matrix_apply(sg, sub_c, sub_d, + [](const Tc &x, Tc &y) { y = x * 2; }); + joint_matrix_store( + sg, sub_d, pD + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + joint_matrix_load( + sg, sub_a, pA + (sg_startx * TM) * K + sg_starty / sg_size * TK, + K); + joint_matrix_apply(sg, sub_a, sub_ar, + [](const Ta &x, Ta &y) { y = x + 42; }); + ext::intel::experimental::matrix::joint_matrix_store( + sg, sub_ar, + pAr + (sg_startx * TM) * K + sg_starty / sg_size * TK, K); + }); // parallel for + }).wait(); + return apply_verify(C, D, A, Ar); +} + +template +bool test() { + static constexpr size_t M = TM * 2; + static constexpr size_t N = TN * 2; + static constexpr size_t K = TK * 2; + queue q; + + Tc *C = malloc_shared(M * N, q); + Tc *D = malloc_shared(M * N, q); + Ta *A = malloc_shared(M * K, q); + Ta *Ar = malloc_shared(M * K, q); + + matrix_rand(M, N, (Tc *)C, (Tc)100); + matrix_rand(M, K, (Ta *)A, (Ta)100); + + bool res = apply_two_matrices( + C, D, A, Ar, q); + + if constexpr (std::is_same_v) + std::cout << "bfloat16 " << TM << "x" << TN << "x" << TK << ": " + << (res ? "passed" : "failed") << std::endl; + else if constexpr (std::is_same_v) + std::cout << "int8_t " << TM << "x" << TN << "x" << TK << ": " + << (res ? 
"passed" : "failed") << std::endl; + free(C, q); + free(D, q); + free(A, q); + free(Ar, q); + + return res; +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + bool passed = true; + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + passed &= test(); + passed &= test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + passed &= test(); + passed &= test(); +// This combination is not currently supported for sub group size = 32 in IGC +#if (!defined(SG_SZ) || SG_SZ != 32) + passed &= test(); +#endif + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + passed &= test(); + passed &= test(); + break; + } + } + + return !passed; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp new file mode 100644 index 0000000000000..2be4c14615799 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp @@ -0,0 +1,16 @@ +//==--- joint_matrix_bf16_fill_k_cache.cpp - DPC++ joint_matrix----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix + +// RUN: %{build} -o %t.out -ffp-model=precise +// RUN: %{run} %t.out + +// -ffp-model=precise is added to not depend on compiler defaults. 
+ +#include "common.hpp" +#include "joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_OOB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_OOB.cpp new file mode 100644 index 0000000000000..4d84656c3d451 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_OOB.cpp @@ -0,0 +1,19 @@ +//==--- joint_matrix_bf16_fill_k_cache_OOB.cpp - DPC++ joint_matrix--------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix, gpu +// UNSUPPORTED: gpu-intel-dg2 + +// RUN: %{build} -o %t_gpu.out -ffp-model=precise -DOOB +// RUN: %{run} %t_gpu.out + +// XFAIL: gpu + +// -ffp-model=precise is added to not depend on compiler defaults. + +#include "common.hpp" +#include "joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_impl.hpp new file mode 100644 index 0000000000000..56250cf9fb3e1 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_impl.hpp @@ -0,0 +1,337 @@ +//------------------------------------------------------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-------------------------------------------------------------------------===// + +#include +#include + +// number of test iterations +constexpr unsigned int testIterations = 100; +// start recording time after X iterations +constexpr unsigned int recordThresh = 10; + +#ifndef MATRIX_SIZE +#define MATRIX_SIZE 256 +#endif + +#ifdef MANUAL_UNROLL +template +static constexpr void loop(std::integer_sequence, F &&f) { + (f(std::integral_constant{}), ...); // C++17 fold expression +} + +template +static constexpr void manually_unroll_loop(F &&f) { + loop(std::make_integer_sequence{}, std::forward(f)); +} +#endif + +template class MatMul; + +template +double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { + size_t sgSize = get_sg_size>(q); + range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize}; + range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize}; + + // throw error if padding needed + assert(colsA == rowsB); + assert(rowsA % TM == 0); + assert(colsA % TK == 0); + assert(colsB % TN == 0); + // submit main kernel + std::chrono::high_resolution_clock::time_point start = + std::chrono::high_resolution_clock::now(); + + q.submit([&](handler &h) { + h.parallel_for>( // cache layer#1 + nd_range<2>{global, cachelocal}, + // loop global + // loop localrange + [=](nd_item<2> it) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + auto pA = + address_space_cast(A); + auto pB = + address_space_cast(B); + auto pC = + address_space_cast(C); + auto m2 = it.get_group(0); + auto n2 = it.get_group(1); + auto m1 = it.get_local_id(0); + auto n1 = it.get_local_id(1) / sgSize; + auto sg = it.get_sub_group(); + joint_matrix + tC[MCache1 / TM][NCache1 / TN] +#ifdef INIT_LIST + = {}; // default initialization of all array elements +#else + ; // no initialization +#endif + +#ifdef MANUAL_UNROLL + manually_unroll_loop([&](auto m) { + manually_unroll_loop([&](auto n) { 
+#else + for (unsigned int m = 0; m < MCache1 / TM; m++) { + for (unsigned int n = 0; n < NCache1 / TN; n++) { +#endif + joint_matrix_fill(sg, tC[m][n], 0); +#ifdef MANUAL_UNROLL + }); + }); +#else + } + } +#endif + + for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) { + joint_matrix + tA[MCache1 / TM][KCache2 / KCache1] +#ifdef INIT_LIST + = {}; // default initialization of all array elements +#else + ; // no initialization +#endif + + joint_matrix + tB[NCache1 / TN][KCache2 / KCache1] +#ifdef INIT_LIST + = {}; // default initialization of all array elements +#else + ; // no initialization +#endif + +#ifdef MANUAL_UNROLL + manually_unroll_loop([&](auto k1) { +#else + for (unsigned int k1 = 0; k1 < KCache2 / KCache1; k1++) { +#endif + // physical layer + unsigned int k = (k2 * KCache2 + k1 * KCache1) / TK; +#ifdef MANUAL_UNROLL + manually_unroll_loop([&](auto m) { +#else + for (unsigned int m = 0; m < MCache1 / TM; m++) { +#endif +#ifdef OOB + ext::intel::experimental::matrix::joint_matrix_load_checked( + sg, tA[m][k1], pA, colsA, rowsA, colsA, + m2 * MCache2 + m1 * MCache1 + m * TM, k * TK); +#else + joint_matrix_load( + sg, tA[m][k1], + pA + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsA + + k * TK, + colsA); +#endif +#ifdef MANUAL_UNROLL + }); // m +#else + } // m +#endif +#ifdef MANUAL_UNROLL + manually_unroll_loop([&](auto n) { +#else + for (unsigned int n = 0; n < NCache1 / TN; n++) { +#endif +#ifdef OOB + ext::intel::experimental::matrix::joint_matrix_load_checked( + sg, tB[n][k1], pB, colsB * VNNI, rowsB / VNNI, colsB * VNNI, + k * TK / VNNI, + (n2 * NCache2 + n1 * NCache1 + n * TN) * VNNI); +#else + joint_matrix_load(sg, tB[n][k1], + pB + (k * TK / VNNI) * (colsB * VNNI) + + (n2 * NCache2 + n1 * NCache1 + n * TN) * + VNNI, + colsB * VNNI); +#endif +#ifdef MANUAL_UNROLL + }); +#else + } // n +#endif +#ifdef MANUAL_UNROLL + manually_unroll_loop([&](auto m) { +#else + for (unsigned int m = 0; m < MCache1 / TM; m++) { +#endif +#ifdef MANUAL_UNROLL + 
manually_unroll_loop([&](auto n) { +#else + for (unsigned int n = 0; n < NCache1 / TN; n++) { + +#endif + joint_matrix_mad(sg, tC[m][n], tA[m][k1], tB[n][k1], + tC[m][n]); +#ifdef MANUAL_UNROLL + }); // n + }); // m + }); // for k1 +#else + } // n + } // m + } // k1 +#endif + } // for k2 +#ifdef MANUAL_UNROLL + manually_unroll_loop([&](auto m) { +#else + for (unsigned int m = 0; m < MCache1 / TM; m++) { +#endif +#ifdef MANUAL_UNROLL + manually_unroll_loop([&](auto n) { +#else + for (unsigned int n = 0; n < NCache1 / TN; n++) { +#endif +#ifdef OOB + ext::intel::experimental::matrix::joint_matrix_store_checked( + sg, tC[m][n], pC, colsB, layout::row_major, rowsA, colsB, + m2 * MCache2 + m1 * MCache1 + m * TM, + n2 * NCache2 + n1 * NCache1 + n * TN); +#else + joint_matrix_store( + sg, tC[m][n], + pC + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsB + + (n2 * NCache2 + n1 * NCache1 + n * TN), + colsB, layout::row_major); +#endif +#ifdef MANUAL_UNROLL + }); // n + }); // m +#else + } // n + } // m +#endif + }); // parallel_for + }); // queue.submit + + if (i == testIterations - 1) + q.wait(); + std::chrono::duration duration = + std::chrono::high_resolution_clock::now() - start; + + return duration.count(); +} + +template +void test() { + assert(MATRIX_SIZE >= TM && MATRIX_SIZE >= TK && MATRIX_SIZE >= TN && + "invalid matrix size"); + assert((MATRIX_SIZE % TM) == 0 && (MATRIX_SIZE % TN) == 0 && + (MATRIX_SIZE % TK) == 0 && + "invalid matrix size detected: not a multiple of "); + + std::cout << "Testing: " << TM << " x " << TN << " x " << TK + << " [TM x TN x TK]" << std::endl; + + queue q; + T *A = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); + T *B = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); + T *vnniB = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); + TResult *C = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); + TResult *refC = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); + + matrix_rand(MATRIX_SIZE, MATRIX_SIZE, A, T(1)); + matrix_rand(MATRIX_SIZE, MATRIX_SIZE, B, 
T(1)); + matrix_vnni(MATRIX_SIZE, MATRIX_SIZE, B, vnniB, VNNI); + + matrix_multiply_ref(A, B, refC, MATRIX_SIZE, MATRIX_SIZE, + MATRIX_SIZE); + + // run testIterations times, aggregate and calculate the average run time + double totalDuration = 0; + for (unsigned int i = 0; i < testIterations; i++) { + double duration = + joint_matmul(A, vnniB, C, q, i); + if (i >= recordThresh) { // exclude warm-up iterations from the total + totalDuration += duration; + } + } + + assert(matrix_compare(MATRIX_SIZE, MATRIX_SIZE, C, refC)); + + double msecPerMatrixMul = + totalDuration / static_cast(testIterations - recordThresh); + double gflops = (2.f * MATRIX_SIZE * MATRIX_SIZE * MATRIX_SIZE * 1.0e-9f) / + (msecPerMatrixMul / 1000.f); + + std::cout << "DONE for size " << MATRIX_SIZE << std::endl; + std::cout << "GOPS is " << gflops << " Gop/s" << std::endl; + + free(A, q); + free(B, q); + free(vnniB, q); + free(C, q); + free(refC, q); +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + constexpr size_t MCache1 = 32; + constexpr size_t MCache2 = 256; + constexpr size_t NCache2 = 256; + constexpr size_t KCache2 = 32; + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + constexpr size_t NCache1 = 32; + constexpr size_t KCache1 = 32; + + test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + constexpr size_t NCache1 = 4 * /*TN*/ 16; + constexpr size_t KCache1 = 16; + + test(); +#if (!defined(SG_SZ) || SG_SZ != 32) + // These combinations are not currently supported for subgroup size = 32 + // in IGC + test(); + test(); +#endif + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + constexpr size_t NCache1 = 4 * /*TN*/ 8; + constexpr size_t KCache1 = 16; + + test(); + break; + } + } + return 0; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_init.cpp
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_init.cpp new file mode 100644 index 0000000000000..0770e7881edc7 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_init.cpp @@ -0,0 +1,16 @@ +//==---joint_matrix_bf16_fill_k_cache_init.cpp - DPC++ joint_matrix--------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix, gpu + +// RUN: %{build} -o %t.out -DINIT_LIST -ffp-model=precise +// RUN: %{run} %t.out + +// -ffp-model=precise is added to not depend on compiler defaults. + +#include "common.hpp" +#include "joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll.cpp new file mode 100644 index 0000000000000..4f5616d7e7f4f --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll.cpp @@ -0,0 +1,18 @@ +//==---joint_matrix_bf16_fill_k_cache_unroll.cpp - DPC++ joint_matrix------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix + +// RUN: %{build} -mllvm -inline-threshold=2000 -ffp-model=precise -o %t.out -DMANUAL_UNROLL +// RUN: %{run} %t.out + +// -mllvm -inline-threshold=2000 added as a workaround, +// since IGC doesn't support some variants of IR for Joint Matrix currently +// -ffp-model=precise is added to not depend on compiler defaults. + +#include "common.hpp" +#include "joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp new file mode 100644 index 0000000000000..ff4c29251200d --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp @@ -0,0 +1,18 @@ +//==--joint_matrix_bf16_fill_k_cache_unroll_init.cpp - DPC++ joint_matrix--==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix, gpu + +// RUN: %{build} -mllvm -inline-threshold=2000 -ffp-model=precise -o %t_gpu.out -DINIT_LIST -DMANUAL_UNROLL +// RUN: %{run} %t_gpu.out + +// -mllvm -inline-threshold=2000 added as a workaround, +// since IGC doesn't support some variants of IR for Joint Matrix currently +// -ffp-model=precise is added to not depend on compiler defaults. 
+ +#include "common.hpp" +#include "joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp index db664554a1a14..d1fbd4cbda15b 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{build} -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" -#include "../joint_matrix_bfloat16_impl.hpp" +#include "common.hpp" +#include "joint_matrix_bfloat16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp index bcbb20228ac00..5cd2a4dc1962f 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{build} -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" -#include "../joint_matrix_bfloat16_array_impl.hpp" +#include "common.hpp" +#include "joint_matrix_bfloat16_array_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array_impl.hpp new file mode 100644 index 
0000000000000..f393eaa5e8436 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array_impl.hpp @@ -0,0 +1,153 @@ +//==----------------------------------------------------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-------------------------------------------------------------------------===// + +template class mult; + +static constexpr int JM_ARRAY_SZ = 2; + +template +void matrix_multiply(big_matrix &C, big_matrix &A, + big_matrix &B) { + size_t NDRangeM = M / (TM * JM_ARRAY_SZ); + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC((TResult *)C.get_data(), range<2>(M, N)); + + queue q; + size_t sg_size = get_sg_size>(q); + q.submit([&](handler &cgh) { + sycl::accessor accA{bufA, cgh, sycl::read_write}; + sycl::accessor accB{bufB, cgh, sycl::read_write}; + sycl::accessor accC{bufC, cgh, sycl::read_write}; + + cgh.parallel_for>( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + // Matrix API has to be accessed by all the workitems in a + // subgroup. These functions will be called once by the subgroup. + // No code divergence between the workitems. + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a[JM_ARRAY_SZ]; + + // For B, we assume B has been already VNNIed. 
+ joint_matrix + sub_b; + joint_matrix + sub_c[JM_ARRAY_SZ]; + + for (int i = 0; i < JM_ARRAY_SZ; ++i) + joint_matrix_fill(sg, sub_c[i], TResult(1)); + + for (int k = 0; k < K / TK; ++k) { + joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr() + + (k * TK / VNNI) * (N * VNNI) + + sg_starty / sg_size * TN * VNNI, + N * VNNI); + + for (int i = 0; i < JM_ARRAY_SZ; ++i) { + joint_matrix_load( + sg, sub_a[i], + accA.template get_multi_ptr() + + (sg_startx * TM * JM_ARRAY_SZ + TM * i) * K + k * TK, + K); + joint_matrix_mad(sg, sub_c[i], sub_a[i], sub_b, sub_c[i]); + } + } + + for (int i = 0; i < JM_ARRAY_SZ; ++i) + joint_matrix_store( + sg, sub_c[i], + accC.template get_multi_ptr() + + (sg_startx * TM * JM_ARRAY_SZ + TM * i) * N + + sg_starty / sg_size * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +template +void test() { + std::cout << "Testing: " << TM << " x " << TN << " x " << TK + << " [TM x TN x TK]" << std::endl; + static constexpr size_t MATRIX_M = TM * 2; + static constexpr size_t MATRIX_N = TN * 2; + static constexpr size_t MATRIX_K = TK * 2; + + T A[MATRIX_M][MATRIX_K]; + T B[MATRIX_K / VNNI][MATRIX_N * VNNI]; + + TResult C[MATRIX_M][MATRIX_N]; + TResult D[MATRIX_M][MATRIX_N]; + + matrix_fill(MATRIX_M, MATRIX_K, (T *)A, + [](int i, int j) { return TResult(1) * (i + j); }); + matrix_fill(MATRIX_K / VNNI, MATRIX_N * VNNI, (T *)B, + [](int i, int j) { return TResult(2) * i + TResult(3) * j; }); + matrix_fill(MATRIX_M, MATRIX_N, (TResult *)C, TResult(1)); + matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, TResult(1)); + + big_matrix MC((TResult *)&C); + big_matrix MD((TResult *)&D); + big_matrix MA((T *)&A); + big_matrix MB((T *)&B); + + matrix_multiply( + MC, MA, MB); + matrix_multiply_ref((T *)A, (T *)B, (TResult *)D, + MATRIX_M, MATRIX_N, MATRIX_K / VNNI); + + assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)C, (TResult *)D)); +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for 
(unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test(); +#if (!defined(SG_SZ) || SG_SZ != 32) + // These combinations are not currently supported for subgroup size = 32 + // in IGC + test(); + test(); + test(); +#endif + break; // always leave the loop here, whatever SG_SZ is + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test(); + break; + } + } + return 0; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp new file mode 100644 index 0000000000000..21d5f1239cd8d --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -0,0 +1,22 @@ +//==-- joint_matrix_bfloat16_colmajorA_colmajorB.cpp - DPC++ joint_matrix--==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// This tests support of col major layout for matrix B which does transpose and +// then VNNI transform.
This is currently only available on AMX + +// XFAIL: gpu + +#include "common.hpp" + +constexpr size_t TN = 16; + +#include "joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp new file mode 100644 index 0000000000000..e3234da2cd5d9 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp @@ -0,0 +1,105 @@ +//==-joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp- DPC++ joint_matrix-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#define TM 8 +#define TK 16 + +template +void matrix_multiply(big_matrix &C, big_matrix &A, + big_matrix &B) { + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC((float *)C.get_data(), range<2>(M, N)); + + queue q; + size_t sg_size = get_sg_size(q); + q.submit([&](handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = 
global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + // For B, we assume B has been already VNNIed. + joint_matrix + sub_b; + joint_matrix sub_c; + + joint_matrix_load( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr() + + (k * TK) * M + sg_startx * TM, + M); + joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr() + + (sg_starty / sg_size * TN) * K + k * TK, + K); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + } + joint_matrix_store( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +int main() { + static constexpr size_t MATRIX_M = TM * 2; + static constexpr size_t MATRIX_N = TN * 2; + static constexpr size_t MATRIX_K = TK * 2; + bfloat16 A[MATRIX_K][MATRIX_M]; + bfloat16 B[MATRIX_N][MATRIX_K]; + float C[MATRIX_M][MATRIX_N]; + float D[MATRIX_M][MATRIX_N]; + + matrix_fill(MATRIX_K, MATRIX_M, (bfloat16 *)A, + [](int i, int j) { return 1.0f * (i + j); }); + matrix_fill(MATRIX_N, MATRIX_K, (bfloat16 *)B, + [](int i, int j) { return 2.0f * i + 3.0f * j; }); + matrix_fill(MATRIX_M, MATRIX_N, (float *)C, 1.0f); + matrix_fill(MATRIX_M, MATRIX_N, (float *)D, 1.0f); + + big_matrix MC((float *)&C); + big_matrix MD((float *)&D); + big_matrix MA((bfloat16 *)&A); + big_matrix MB((bfloat16 *)&B); + matrix_multiply(MC, MA, MB); + matrix_multiply_ref((bfloat16 *)A, (bfloat16 *)B, (float *)D, MATRIX_M, + MATRIX_N, MATRIX_K, false, true, true); + + bool res = matrix_compare(MATRIX_M, MATRIX_N, (float *)C, (float *)D); + std::cout << (res ? 
"passed" : "failed") << std::endl; + return !res; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_impl.hpp new file mode 100644 index 0000000000000..fc1fc1963e047 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_impl.hpp @@ -0,0 +1,142 @@ +//===---joint_matrix_bfloat16_impl.hpp - DPC++ joint_matrix----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +template class imatrix; + +template +void matrix_multiply(big_matrix &C, big_matrix &A, + big_matrix &B) { + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC((float *)C.get_data(), range<2>(M, N)); + + queue q; + size_t sg_size = get_sg_size>(q); + q.submit([&](handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + + cgh.parallel_for>( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + // For B, we assume B has been already VNNIed. 
+ joint_matrix + sub_b; + joint_matrix sub_c; + + joint_matrix_load( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + for (int k = 0; k < K / TK; k += 1) { // + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TM) * K + k * TK, + K); + joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr() + + (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2, + N * 2); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + } + joint_matrix_store( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +template +void test() { + std::cout << "Testing: " << TM << " x " << TN << " x " << TK + << " [TM x TN x TK]" << std::endl; + + static constexpr size_t MATRIX_M = TM * 2; + static constexpr size_t MATRIX_N = TN * 2; + static constexpr size_t MATRIX_K = TK * 2; + T A[MATRIX_M][MATRIX_K]; + T B[MATRIX_K / 2][MATRIX_N * 2]; + TResult C[MATRIX_M][MATRIX_N]; + TResult D[MATRIX_M][MATRIX_N]; + + matrix_fill(MATRIX_M, MATRIX_K, (T *)A, + [](int i, int j) { return T(1) * (i + j); }); + matrix_fill(MATRIX_K / 2, MATRIX_N * 2, (T *)B, + [](int i, int j) { return T(2) * i + T(3) * j; }); + matrix_fill(MATRIX_M, MATRIX_N, (TResult *)C, TResult(1)); + matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, TResult(1)); + + big_matrix MC((TResult *)&C); + big_matrix MD((TResult *)&D); + big_matrix MA((T *)&A); + big_matrix MB((T *)&B); + matrix_multiply(MC, MA, + MB); + matrix_multiply_ref((T *)A, (T *)B, (TResult *)D, MATRIX_M, + MATRIX_N, MATRIX_K / 2); + + assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)C, (TResult *)D)); +} +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test(); + break; + } + + if (combinations[i].nsize == 16) 
{ // architecture::intel_gpu_pvc + test(); + + // This combination is not currently supported for sub group size = 32 in + // IGC +#if (!defined(SG_SZ) || SG_SZ != 32) + test(); + test(); + test(); +#endif + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test(); + break; + } + } + return 0; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp new file mode 100644 index 0000000000000..bf3a65d2a16d7 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp @@ -0,0 +1,18 @@ +//==----- joint_matrix_bfloat16_packedB.cpp - DPC++ joint_matrix----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix +// REQUIRES-INTEL-DRIVER: lin: 27868, win: 101.5181 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=2 %{run} %t.out %} +// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %} +// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %} + +#include "common.hpp" +#include "joint_matrix_bfloat16_packedB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB_impl.hpp new file mode 100644 index 0000000000000..6a7182c41985d --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB_impl.hpp @@ -0,0 +1,132 @@ +//=----- joint_matrix_bfloat16_packedB_impl.hpp - DPC++ joint_matrix -------=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//=-------------------------------------------------------------------------=// + +template +void matrix_multiply(big_matrix &C, big_matrix &A, + big_matrix &B) { + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC((float *)C.get_data(), range<2>(M, N)); + + queue q; + size_t sg_size = get_sg_size(q); + q.submit([&](handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + // For B, we assume B has been already VNNIed. + joint_matrix + sub_b; + joint_matrix sub_c; + + joint_matrix_load( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + for (int k = 0; k < K / TK; k += 1) { // + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TM) * K + k * TK, + K); + // Assuming B data is already in VNNI format. 
+ joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr() + + (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2, + N * 2); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + } + joint_matrix_store( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +template int test() { + static constexpr size_t MATRIX_M = TM * 2; + static constexpr size_t MATRIX_N = TN * 2; + static constexpr size_t MATRIX_K = TK * 2; + bfloat16 A[MATRIX_M][MATRIX_K]; + bfloat16 B[MATRIX_K / 2][MATRIX_N * 2]; + float C[MATRIX_M][MATRIX_N]; + float D[MATRIX_M][MATRIX_N]; + + matrix_fill(MATRIX_M, MATRIX_K, (bfloat16 *)A, + [](int i, int j) { return 1.0f * (i + j); }); + matrix_fill(MATRIX_K / 2, MATRIX_N * 2, (bfloat16 *)B, + [](int i, int j) { return 2.0f * i + 3.0f * j; }); + matrix_fill(MATRIX_M, MATRIX_N, (float *)C, 1.0f); + matrix_fill(MATRIX_M, MATRIX_N, (float *)D, 1.0f); + + big_matrix MC((float *)&C); + big_matrix MD((float *)&D); + big_matrix MA((bfloat16 *)&A); + big_matrix MB((bfloat16 *)&B); + matrix_multiply(MC, MA, MB); + matrix_multiply_ref( + (bfloat16 *)A, (bfloat16 *)B, (float *)D, MATRIX_M, MATRIX_N, + MATRIX_K / 2); + + bool res = matrix_compare(MATRIX_M, MATRIX_N, (float *)C, (float *)D); + std::cout << TM << "x" << TN << "x" << TK << " "; + std::cout << (res ? 
"passed" : "failed") << std::endl; + return !res; +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + int ret = 0; + for (auto &combination : combinations) { + if (combination.nsize == 0) { // Intel AMX + ret += test<16, 16, 16, class amx16x16x16>(); + break; + } + + if (combination.nsize == 16) { // architecture::intel_gpu_pvc + ret += test<16, 16, 16, class pvc16x16x16>(); + ret += test<32, 64, 16, class pvc32x64x16>(); + ret += test<1, 64, 16, class pvc1x64x16>(); + break; + } + } + + return ret; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_colA_rowB_colC.cpp new file mode 100644 index 0000000000000..354a71006e129 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_colA_rowB_colC.cpp @@ -0,0 +1,19 @@ +//==---------- joint_matrix_colA_rowB_colC.cpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// XFAIL:* + +#include "common.hpp" + +constexpr size_t TN = 16; + +#include "joint_matrix_colA_rowB_colC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_colA_rowB_colC_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_colA_rowB_colC_impl.hpp new file mode 100644 index 0000000000000..373ec652cc063 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_colA_rowB_colC_impl.hpp @@ -0,0 +1,99 @@ +//===---joint_matrix_colA_rowB_colC_impl.hpp - DPC++ joint_matrix----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +constexpr size_t TM = 8; +constexpr size_t TK = 16; + +template +void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) { + size_t M = NUM_ROWS_C; + size_t N = NUM_COLS_C; + size_t K = NUM_COLS_A; + + assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B); + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + size_t sg_size = get_sg_size(q); + + q.submit([&](handler &cgh) { + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + auto pA = + address_space_cast(A); + auto pB = + address_space_cast(B); + auto pC = + address_space_cast(C); + + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + joint_matrix + sub_b; + joint_matrix sub_c; + joint_matrix_fill(sg, sub_c, 1); + for (int k = 0; k < K; k += TK) { + joint_matrix_load(sg, sub_a, pA + (sg_startx * TM) * K + k, K); + joint_matrix_load(sg, sub_b, pB + k * N + sg_starty / sg_size * TN, + N); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + } + joint_matrix_store( + sg, sub_c, pC + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::col_major); + }); // parallel for + }).wait(); +} + +int main() { + static constexpr size_t MATRIX_M = 1024; + static constexpr size_t MATRIX_N = 1024; + static constexpr 
size_t MATRIX_K = 1024; + queue q; + bfloat16 *A = malloc_shared(MATRIX_M * MATRIX_K, q); + bfloat16 *B = malloc_shared(MATRIX_K * MATRIX_N, q); + float *C = malloc_shared(MATRIX_M * MATRIX_N, q); + float *D = malloc_shared(MATRIX_M * MATRIX_N, q); + + matrix_rand(MATRIX_M, MATRIX_K, A, (bfloat16)5); + matrix_rand(MATRIX_K, MATRIX_N, B, (bfloat16)5); + matrix_fill(MATRIX_M, MATRIX_N, C, (float)1.0); + matrix_fill(MATRIX_M, MATRIX_N, D, (float)1.0); + + matrix_multiply(C, A, B, q); + matrix_multiply_ref(A, B, D, MATRIX_M, MATRIX_N, MATRIX_K, + true /*transposed c*/); + + bool res = matrix_compare(MATRIX_M, MATRIX_N, C, D); + + std::cout << (res ? "passed" : "failed") << std::endl; + return !res; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_down_convert.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_down_convert.cpp new file mode 100644 index 0000000000000..dee504c22e7f6 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_down_convert.cpp @@ -0,0 +1,14 @@ +//==-------- joint_matrix_down_convert.cpp - DPC++ joint_matrix------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include "common.hpp" +#include "joint_matrix_down_convert_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_down_convert_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_down_convert_impl.hpp new file mode 100644 index 0000000000000..8ac48511c7e10 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_down_convert_impl.hpp @@ -0,0 +1,82 @@ +//==-------- joint_matrix_down_convert_impl.hpp - DPC++ joint_matrix-------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; + +constexpr size_t TM = 8; +// TN and TK must be the same for this test. 
+constexpr size_t TN = 16; +constexpr size_t TK = 16; + +template +void matrix_copy(big_matrix &C, big_matrix &A) { + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufC((float *)C.get_data(), range<2>(M, N)); + + queue q; + size_t sg_size = get_sg_size(q); + q.submit([&](handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + joint_matrix sub_c; + + joint_matrix_load( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + joint_matrix_copy(sg, sub_c, sub_a); + ext::intel::experimental::matrix::joint_matrix_store( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N); + }); // parallel for + }).wait(); +} + +int main() { + static constexpr size_t MATRIX_M = TM * 2; + static constexpr size_t MATRIX_N = TN * 2; + static constexpr size_t MATRIX_K = TK * 2; + bfloat16 A[MATRIX_M][MATRIX_K]; + float C[MATRIX_M][MATRIX_N]; + + matrix_rand(MATRIX_M, MATRIX_N, *C, (float)5); + + big_matrix MC((float *)&C); + big_matrix MA((bfloat16 *)&A); + matrix_copy(MC, MA); + + bool res = matrix_compare(MATRIX_M, MATRIX_N, (bfloat16 *)A, (float *)C); + std::cout << (res ? 
"passed" : "failed") << std::endl; + return !res; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half.cpp index 424bb83050048..44fa8cd86e7cb 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half.cpp @@ -8,10 +8,9 @@ // REQUIRES: aspect-fp16 // REQUIRES: aspect-ext_intel_matrix // REQUIRES: matrix-fp16 -// REQUIRES: gpu -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{build} -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" -#include "../joint_matrix_half_impl.hpp" +#include "common.hpp" +#include "joint_matrix_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half_impl.hpp new file mode 100644 index 0000000000000..a9de620af4237 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half_impl.hpp @@ -0,0 +1,131 @@ +//===---joint_matrix_half_impl.hpp - DPC++ joint_matrix--------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +template class mult; + +template +void matrix_multiply(big_matrix &C, big_matrix &A, + big_matrix &B) { + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC(C.get_data(), range<2>(M, N)); + + queue q; + size_t sg_size = get_sg_size>(q); + q.submit([&](handler &cgh) { + accessor accA{bufA, cgh}; + accessor accB{bufB, cgh}; + accessor accC{bufC, cgh}; + + cgh.parallel_for>( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup + // no code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; + // For B, we assume B has been already VNNIed. 
+ joint_matrix + sub_b; + joint_matrix sub_c; + + joint_matrix_load( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TM) * K + k * TK, + K); + joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr() + + (k * TK / VNNI) * (N * VNNI) + + sg_starty / sg_size * TN * VNNI, + N * VNNI); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + } + joint_matrix_store( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +template +void test() { + static constexpr size_t MATRIX_M = TM * 2; + static constexpr size_t MATRIX_N = TN * 2; + static constexpr size_t MATRIX_K = TK * 2; + T A[MATRIX_M][MATRIX_K]; + T B[MATRIX_K / VNNI][MATRIX_N * VNNI]; + TResult C[MATRIX_M][MATRIX_N]; + TResult D[MATRIX_M][MATRIX_N]; + + matrix_fill(MATRIX_M, MATRIX_K, (T *)A, + [](int i, int j) { return i + 2 * j; }); + matrix_fill(MATRIX_K / VNNI, MATRIX_N * VNNI, (T *)B, + [](int i, int j) { return i + j; }); + matrix_fill(MATRIX_M, MATRIX_N, (TResult *)C, TResult(1)); + matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, TResult(1)); + + big_matrix MC((TResult *)&C); + big_matrix MD((TResult *)&D); + big_matrix MA((T *)&A); + big_matrix MB((T *)&B); + matrix_multiply( + MC, MA, MB); + matrix_multiply_ref((T *)A, (T *)B, (TResult *)D, + MATRIX_M, MATRIX_N, MATRIX_K / VNNI); + + assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)C, (TResult *)D)); +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test(); + break; + } + + if (combinations[i].nsize == 8) 
{ // architecture::intel_gpu_dg2* + test(); + break; + } + } + return 0; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB.cpp new file mode 100644 index 0000000000000..37769a41f7003 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB.cpp @@ -0,0 +1,22 @@ +//==----- joint_matrix_int8_colmajorA_colmajorB.cpp - DPC++ joint_matrix---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// This tests support of col major layout for matrix B which does transpose and +// then VNNI transform. This is currently only available on AMX + +// XFAIL: gpu + +#include "common.hpp" + +constexpr size_t TN = 16; + +#include "joint_matrix_int8_colmajorA_colmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB_impl.hpp new file mode 100644 index 0000000000000..1390f8225406c --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB_impl.hpp @@ -0,0 +1,114 @@ +//===-joint_matrix_int8_colmajorA_colmajorB_impl.hpp - DPC++ joint_matrix--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#define TM 8 +#define TK 32 + +template +void matrix_multiply(big_matrix &C, + big_matrix &A, + big_matrix &B) { + size_t M = NUM_ROWS_C; + size_t N = NUM_COLS_C; + size_t K = NUM_COLS_A; + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC(C.get_data(), range<2>(M, N)); + + queue q; + size_t sg_size = get_sg_size(q); + q.submit([&](handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + joint_matrix + sub_b; + joint_matrix sub_c; + + joint_matrix_fill(sg, sub_c, 0); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr() + + (k * TK) * M + sg_startx * TM, + M); + joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr() + + (sg_starty / sg_size * TN) * K + k * TK, + K); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + } + joint_matrix_store( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +int main() { + static 
constexpr size_t MATRIX_M = TM; + static constexpr size_t MATRIX_N = TN; + static constexpr size_t MATRIX_K = TK; + int8_t A[MATRIX_K][MATRIX_M]; + int8_t Aref[MATRIX_K][MATRIX_M]; + int8_t B[MATRIX_N][MATRIX_K]; + int8_t Bref[MATRIX_N][MATRIX_K]; + int32_t C[MATRIX_M][MATRIX_N]; + int32_t D[MATRIX_M][MATRIX_N]; + + matrix_fill(MATRIX_K, MATRIX_M, (int8_t *)A, + [](int i, int j) { return 2 * i + j; }); + matrix_fill(MATRIX_K, MATRIX_M, (int8_t *)Aref, + [](int i, int j) { return 2 * i + j; }); + + matrix_fill(MATRIX_N, MATRIX_K, (int8_t *)B, + [](int i, int j) { return i + 2 * j; }); + matrix_fill(MATRIX_N, MATRIX_K, (int8_t *)Bref, + [](int i, int j) { return i + 2 * j; }); + + matrix_fill(MATRIX_M, MATRIX_N, (int32_t *)C, 0); + matrix_fill(MATRIX_M, MATRIX_N, (int32_t *)D, 0); + + big_matrix MC((int32_t *)&C); + big_matrix MD((int32_t *)&D); + big_matrix MA((int8_t *)&A); + big_matrix MB((int8_t *)&B); + matrix_multiply(MC, MA, MB); + matrix_multiply_ref((int8_t *)Aref, (int8_t *)Bref, (int32_t *)D, MATRIX_M, + MATRIX_N, MATRIX_K, false, true, true); + + bool res = matrix_compare(MATRIX_M, MATRIX_N, (int32_t *)C, (int32_t *)D); + std::cout << (res ? "passed" : "failed") << std::endl; + return !res; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB.cpp new file mode 100644 index 0000000000000..fe2506c2acf69 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB.cpp @@ -0,0 +1,19 @@ +//==----- joint_matrix_int8_rowmajorA_rowmajorB.cpp - DPC++ joint_matrix---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Run these 2 tests on PVC only for now. Check can be updated to "gpu", +// when newer IGC is used in intel/llvm pre-checkin testing on Intel Arc +// RUN: %if arch-intel_gpu_pvc %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %} +// RUN: %if arch-intel_gpu_pvc %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %} + +#include "common.hpp" +#include "joint_matrix_int8_rowmajorA_rowmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB_impl.hpp new file mode 100644 index 0000000000000..12f91f083def4 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB_impl.hpp @@ -0,0 +1,123 @@ +//===-joint_matrix_int8_rowmajorA_rowmajorB_impl.hpp - DPC++ joint_matrix--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +template class mult; + +template +void matrix_multiply(big_matrix &C, big_matrix &A, + big_matrix &B) { + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC(C.get_data(), range<2>(M, N)); + + queue q; + size_t sg_size = get_sg_size>(q); + q.submit([&](handler &cgh) { + accessor accA{bufA, cgh}; + accessor accB{bufB, cgh}; + accessor accC{bufC, cgh}; + + cgh.parallel_for>( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; + joint_matrix sub_b; + joint_matrix sub_c; + + joint_matrix_fill(sg, sub_c, 0); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TM) * K + k * TK, + K); + // VNNI transform is done automatically at this level + joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr() + + (k * TK) * N + sg_starty / sg_size * TN, + N); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + } + joint_matrix_store( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +template +void test() { + static constexpr size_t MATRIX_M = TM * 2; + static 
constexpr size_t MATRIX_N = TN * 2; + static constexpr size_t MATRIX_K = TK * 2; + T A[MATRIX_M][MATRIX_K]; + T B[MATRIX_K][MATRIX_N]; + TResult C[MATRIX_M][MATRIX_N]; + TResult D[MATRIX_M][MATRIX_N]; + + matrix_fill(MATRIX_M, MATRIX_K, (T *)A, [](int i, int j) { return i + j; }); + matrix_fill(MATRIX_K, MATRIX_N, (T *)B, + [](int i, int j) { return i + j * 2; }); + matrix_fill(MATRIX_M, MATRIX_N, (TResult *)C, 0); + matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, 0); + + big_matrix MC((TResult *)&C); + big_matrix MD((TResult *)&D); + big_matrix MA((T *)&A); + big_matrix MB((T *)&B); + matrix_multiply(MC, MA, + MB); + matrix_multiply_ref((T *)A, (T *)B, (TResult *)D, MATRIX_M, + MATRIX_N, MATRIX_K); + + assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)C, (TResult *)D)); +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test(); + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test(); + break; + } + } + return 0; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature.cpp new file mode 100644 index 0000000000000..a6b72f80a989d --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature.cpp @@ -0,0 +1,18 @@ +//===---joint_matrix_opt_kernel_feature.cpp - DPC++ joint_matrix-----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: aspect-ext_intel_matrix + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Test checks that exception will be thrown in case matrix parameters are +// incompatible on the current device + +#include "common.hpp" +#include "joint_matrix_opt_kernel_feature_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature_impl.hpp new file mode 100644 index 0000000000000..7aba5911c8386 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature_impl.hpp @@ -0,0 +1,109 @@ +//===---joint_matrix_opt_kernel_feature_impl.hpp - DPC++ joint_matrix------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; + +static constexpr size_t M_MULTIPLIER = 16; + +template +void matrix_multiply(big_matrix &C, big_matrix &A, + big_matrix &B) { + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC(C.get_data(), range<2>(M, N)); + + queue q; + size_t sg_size = get_sg_size(q); + q.submit([&](handler &cgh) { + sycl::accessor accC{bufC, cgh, sycl::read_write}; + sycl::accessor accA{bufA, cgh, sycl::read_only}; + sycl::accessor accB{bufB, cgh, sycl::read_only}; + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; + joint_matrix + sub_b; + joint_matrix sub_c; + + joint_matrix_load( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TM) * K + k * TK, + K); + joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr() + + (k * TK / vnniFactor) * (N * vnniFactor) + + sg_starty / sg_size * TN * vnniFactor, + N * vnniFactor); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + } + }); + }).wait(); +} + +template +void init_and_multiply() { + static constexpr size_t MATRIX_M = tM * M_MULTIPLIER; + static constexpr size_t MATRIX_N = 128; + 
static constexpr size_t MATRIX_K = 128; + + Ta A[MATRIX_M][MATRIX_K]; + Ta B[MATRIX_K][MATRIX_N]; + Ta Bvnni[MATRIX_K / vnni_factor][MATRIX_N * vnni_factor]; + Tc C[MATRIX_M][MATRIX_N]; + + matrix_rand(MATRIX_M, MATRIX_K, (Ta *)A, (Ta)50); + matrix_rand(MATRIX_K, MATRIX_N, (Ta *)B, (Ta)50); + matrix_fill(MATRIX_M, MATRIX_N, (Tc *)C, (Tc)1); + + big_matrix MC((Tc *)&C); + big_matrix MA((Ta *)&A); + matrix_vnni(MATRIX_K, MATRIX_N, (Ta *)&B, (Ta *)&Bvnni, vnni_factor); + big_matrix MBvnni( + (Ta *)&Bvnni); + + matrix_multiply(MC, MA, MBvnni); +} + +int main() { + try { + init_and_multiply(); // 500 is not correct size + } catch (const sycl::exception &e) { + if (e.code() == errc::invalid) + return 0; + throw; + } + + return 1; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature_unsupported_hw.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature_unsupported_hw.cpp new file mode 100644 index 0000000000000..abf1af49e6933 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature_unsupported_hw.cpp @@ -0,0 +1,36 @@ +//===---joint_matrix_opt_kernel_feature_unsupported_hw_impl.cpp------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: gpu-intel-gen12, gpu + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Test checks that exception will be thrown in case object of joint_matrix type +// is used on unsupported HW, in this case, on Gen12. 
+ +#include "common.hpp" + +int main() { + sycl::queue q; + + try { + q.submit([&](sycl::handler &cgh) { + cgh.single_task([]() { + joint_matrix + m; // matrix type and sizes do not matter + }); + }); + } catch (const sycl::exception &e) { + assert((e.code() == sycl::errc::kernel_not_supported) && + (std::string(e.what()) == + std::string("no matrix hardware on the target device, joint_matrix " + "is not supported"))); + } + return 0; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds.cpp new file mode 100644 index 0000000000000..f3485408373b9 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds.cpp @@ -0,0 +1,22 @@ +//==-------- joint_matrix_out_bounds.cpp - DPC++ joint_matrix--------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix + +// UNSUPPORTED: gpu-intel-dg2 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// XFAIL:* + +#include "common.hpp" + +constexpr size_t TN = 16; +constexpr size_t MATRIX_K = 1024 + 24; + +#include "joint_matrix_out_bounds_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds_impl.hpp new file mode 100644 index 0000000000000..4be21beea9c45 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds_impl.hpp @@ -0,0 +1,124 @@ +//===---joint_matrix_out_bounds_impl.hpp - DPC++ joint_matrix--------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +constexpr size_t TM = 8; +constexpr size_t TK = 16; + +template class mult; + +template +void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) { + size_t M = NUM_ROWS_C; + size_t N = NUM_COLS_C; + size_t K = NUM_COLS_A; + + assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * vnniFactor); + // Add one iteration for the out of bounds dpas instruction + size_t NDRangeM = M / TM + (((M % TM) != 0) ? 1 : 0); + size_t NDRangeN = N / TN; + size_t sg_size = get_sg_size>(q); + + q.submit([&](handler &cgh) { + cgh.parallel_for>( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + auto pA = + address_space_cast(A); + auto pB = + address_space_cast(B); + auto pC = + address_space_cast(C); + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + + // For B, since current implementation does not support non-packed + // layout, users need to specify the packed_b layout. 
+ joint_matrix sub_b; + joint_matrix sub_c; + // bounds-checked load where width and height are added + ext::intel::experimental::matrix::joint_matrix_fill_checked( + sg, sub_c, 1, N, M, N, sg_startx * TM, sg_starty / sg_size * TN); + for (int k = 0; k < K; k += TK) { + // bounds-checked load where width and height are added + ext::intel::experimental::matrix::joint_matrix_load_checked( + sg, sub_a, pA, K, M, K, sg_startx * TM, k); + // Assume we alreay in vnni format. + // bounds-checked load where width and height are added + ext::intel::experimental::matrix::joint_matrix_load_checked( + sg, sub_b, pB, N * vnniFactor, K / vnniFactor, N * vnniFactor, + k, sg_starty / sg_size * TN * vnniFactor); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + } + // bounds-checked store where width and height are added + ext::intel::experimental::matrix::joint_matrix_store_checked( + sg, sub_c, pC, N, layout::row_major, M, N, sg_startx * TM, + sg_starty / sg_size * TN); + }); // parallel for + }).wait(); +} + +int main() { + static constexpr size_t MATRIX_M = 1024 + 14; + static constexpr size_t MATRIX_N = 1024; + static constexpr unsigned int vnniFactor = 2; + + queue q; + bfloat16 *A = malloc_shared(MATRIX_M * MATRIX_K, q); + bfloat16 *B = malloc_shared(MATRIX_K * MATRIX_N, q); + bfloat16 *vnniB = malloc_shared(MATRIX_K * MATRIX_N, q); + float *C = malloc_shared(MATRIX_M * MATRIX_N, q); + float *D = malloc_shared(MATRIX_M * MATRIX_N, q); + + matrix_rand(MATRIX_M, MATRIX_K, A, (bfloat16)5); + matrix_rand(MATRIX_K, MATRIX_N, B, (bfloat16)5); + matrix_fill(MATRIX_M, MATRIX_N, C, (float)1); + matrix_fill(MATRIX_M, MATRIX_N, D, (float)1); + + matrix_vnni(MATRIX_K, MATRIX_N, B, vnniB, vnniFactor); + + matrix_multiply_ref(A, B, D, MATRIX_M, MATRIX_N, MATRIX_K); + matrix_multiply(C, A, vnniB, q); + bool res = matrix_compare(MATRIX_M, MATRIX_N, C, D); + + matrix_multiply(C, A, B, q); + res = res && matrix_compare(MATRIX_M, MATRIX_N, C, D); + + std::cout << (res ? 
"passed" : "failed") << std::endl; + + free(A, q); + free(B, q); + free(vnniB, q); + free(C, q); + free(D, q); + + return !res; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp index fe3c25e338094..df5c4b27a122b 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp @@ -5,11 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// REQUIRES: aspect-ext_intel_matrix +// RUN: %{build} -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" +#include "common.hpp" constexpr size_t TN = 16; -#include "../joint_matrix_prefetch_impl.hpp" +#include "joint_matrix_prefetch_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch_impl.hpp new file mode 100644 index 0000000000000..1e665f618860f --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch_impl.hpp @@ -0,0 +1,148 @@ +//===---joint_matrix_prefetch_impl.hpp - DPC++ joint_matrix----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define TM 8 +#define TK 16 + +template class mult; + +template +void joint_matrix_gemm_vnni(sub_group sg, size_t sg_startx, size_t sg_starty, + size_t sg_size, T1 *A, T2 *B, T *C) { + auto pA = address_space_cast(A); + auto pB = address_space_cast(B); + auto pC = address_space_cast(C); + + joint_matrix sub_a; + joint_matrix sub_b; + joint_matrix sub_c; + joint_matrix_prefetch(sg, A + (sg_startx * TM) * K, K, + layout::row_major, + syclex::properties{syclex::prefetch_hint_L1}); + joint_matrix_prefetch(sg, B + sg_starty / sg_size * TN * vnniFactor, + N * vnniFactor, B_layout, + syclex::properties{syclex::prefetch_hint_L1}); + + if constexpr (C_layout == layout::col_major) + joint_matrix_prefetch( + sg, C + (sg_starty / sg_size * TN) * M + (sg_startx * TM), M, C_layout, + syclex::properties{syclex::prefetch_hint_L1}); + else + joint_matrix_prefetch( + sg, C + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, C_layout, + syclex::properties{syclex::prefetch_hint_L1}); + + joint_matrix_fill(sg, sub_c, 1); + for (int k = 0; k < K; k += TK) { + joint_matrix_load(sg, sub_a, pA + (sg_startx * TM) * K + k, K); + joint_matrix_load(sg, sub_b, + pB + k * N + sg_starty / sg_size * TN * vnniFactor, + N * vnniFactor); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + } + + if constexpr (C_layout == layout::col_major) + joint_matrix_store(sg, sub_c, + pC + (sg_starty / sg_size * TN) * M + (sg_startx * TM), + M, C_layout); + else + joint_matrix_store(sg, sub_c, + pC + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, + C_layout); +} + +template +void matrix_multiply(T *C, T1 *A, T2 *B, queue q) { + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + + size_t sg_size = get_sg_size>(q); + q.submit([&](handler &cgh) { + cgh.parallel_for>( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + 
[=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix_gemm_vnni(sg, sg_startx, sg_starty, sg_size, + A, B, C); + }); // parallel for + }).wait(); +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + bool support_p = false; + // joint_matrix_prefetch is not supported on DG2 + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0 || combinations[i].nsize == 16) { + support_p = true; + break; + } + } + if (!support_p) { + std::cout << "Prefetch not supported on this device" << std::endl; + return 0; + } + static constexpr size_t M = TM * 2; + static constexpr size_t N = TN * 2; + static constexpr size_t K = TK * 2; + static constexpr unsigned int vnniFactor = 2; + bfloat16 *A = malloc_shared(M * K, q); + bfloat16 *B = malloc_shared(K * N, q); + bfloat16 *vnniB = malloc_shared(K * N, q); + float *C = malloc_shared(M * N, q); + float *D = malloc_shared(M * N, q); + float *Dt = malloc_shared(M * N, q); + + matrix_fill(M, K, A, [](int i, int j) { return 1.0f * (i + j); }); + matrix_fill(K, N, (bfloat16 *)B, + [](int i, int j) { return 2.0f * i + 3.0f * j; }); + matrix_fill(M, N, C, 1.0f); + matrix_fill(M, N, D, 1.0f); + matrix_fill(M, N, Dt, 1.0f); + + matrix_vnni(K, N, B, vnniB, vnniFactor); + + matrix_multiply_ref(A, B, D, M, N, K); + matrix_multiply(C, A, B, q); + + bool res = matrix_compare(M, N, C, D); + matrix_multiply(C, A, vnniB, q); + + res = res && matrix_compare(M, N, C, D); + + matrix_multiply_ref(A, B, Dt, M, N, K, true); + matrix_multiply(C, A, B, q); + + res = res && matrix_compare(M, N, C, Dt); + std::cout << (res ? 
"passed" : "failed") << std::endl; + return !res; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_query_default.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_query_default.cpp new file mode 100644 index 0000000000000..f160c09525aa7 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_query_default.cpp @@ -0,0 +1,174 @@ +//==-------- joint_matrix_query_default.cpp - DPC++ joint_matrix-----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Needs AMX. +// REQUIRES: cpu +// REQUIRES: matrix + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; + +template struct big_matrix { +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) {} +}; + +template +void matrix_multiply(big_matrix &C, + big_matrix &A, + big_matrix &B) { + size_t M = NUM_ROWS_C; + size_t N = NUM_COLS_C; + size_t K = NUM_COLS_A; + assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * 4); + + using myparams2 = matrix_params< + sycl::ext::oneapi::experimental::architecture::intel_cpu_spr, int8_t, + int8_t, int>; + constexpr int TM = myparams2::M; + constexpr int TN = myparams2::N; + constexpr int TK = myparams2::K; + + std::cout << "AMX query sizes are: M " << TM << " N " << TN << " K " << TK + << std::endl; + + constexpr int SG_SZ = TN; + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC(C.get_data(), range<2>(M, N)); + + queue q; + q.submit([&](handler &cgh) { + auto accC = 
bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), + [accA, accB, accC, M, N, K](nd_item<2> spmd_item) + [[intel::reqd_sub_group_size(SG_SZ)]] + + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sycl::sub_group sg = spmd_item.get_sub_group(); + + myparams2::joint_matrix_a sub_a; + myparams2::joint_matrix_b sub_b; + myparams2::joint_matrix_c sub_c; + + joint_matrix_load( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + N, layout::row_major); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TM) * K + k * TK, + K); + // Assuming B data is already in VNNI format. 
+ joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr() + + (k * TK / 4) * (N * 4) + sg_starty / SG_SZ * TN * 4, + N * 4); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + } + joint_matrix_store( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +static constexpr size_t MATRIX_M = 128; +static constexpr size_t MATRIX_N = 128; +static constexpr size_t MATRIX_K = 128; +int8_t A[MATRIX_M][MATRIX_K]; +int8_t B[MATRIX_K / 4][MATRIX_N * 4]; +int32_t C[MATRIX_M][MATRIX_N]; +int32_t D[MATRIX_M][MATRIX_N]; + +void matrix_multiply_ref(int32_t *A_mem, int32_t *B_mem, int32_t *C_mem, int M, + int N, int K) { + // tiling + for (int m = 0; m < M; m++) + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k++) { + char *va = (char *)(A_mem + m * K + k); + char *vb = (char *)(B_mem + k * N + n); + int acc = *(C_mem + m * N + n); + for (int i = 0; i < 4; i++) { + acc += (va[i] * vb[i]); + } + *(C_mem + m * N + n) = acc; + } + } +} + +int main() { + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_K; j++) { + A[i][j] = i + 2 * j; + } + } + for (int i = 0; i < MATRIX_K / 4; i++) { + for (int j = 0; j < MATRIX_N * 4; j++) { + B[i][j] = i + j; + } + } + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + C[i][j] = 1; + D[i][j] = 1; + } + } + + big_matrix MC((int32_t *)&C); + big_matrix MD((int32_t *)&D); + big_matrix MA((int8_t *)&A); + big_matrix MB((int8_t *)&B); + matrix_multiply(MC, MA, MB); + matrix_multiply_ref((int32_t *)A, (int32_t *)B, (int32_t *)D, MATRIX_M, + MATRIX_N, MATRIX_K / 4); + + bool res = true; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + if (C[i][j] != D[i][j]) + res = false; + } + } + std::cout << (res ? 
"passed" : "failed") << std::endl; + return !res; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB.cpp new file mode 100644 index 0000000000000..bfcf624abe97b --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB.cpp @@ -0,0 +1,22 @@ +//==-------joint_matrix_rowmajorA_rowmajorB.cpp - DPC++ joint_matrix-------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix +// VNNI transform is not supported yet by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=2 %{run} %t.out %} +// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %} +// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %} + +// This tests support of row major layout for matrix B which does automatic VNNI +// transform. This is currently only available on AMX and XMX of PVC + +#include "common.hpp" +#include "joint_matrix_rowmajorA_rowmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB_impl.hpp new file mode 100644 index 0000000000000..754d49c354d01 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB_impl.hpp @@ -0,0 +1,151 @@ +//==-----joint_matrix_rowmajorA_rowmajorB_impl.hpp - DPC++ joint_matrix----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +template +void matrix_multiply(big_matrix &C, big_matrix &A, + big_matrix &B) { + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA((TA *)A.get_data(), range<2>(M, K)); + buffer bufB((TB *)B.get_data(), range<2>(K, N)); + buffer bufC((TC *)C.get_data(), range<2>(M, N)); + + queue q; + size_t sg_size = get_sg_size(q); + q.submit([&](handler &cgh) { + sycl::accessor accC{bufC, cgh, sycl::read_write}; + sycl::accessor accA{bufA, cgh, sycl::read_only}; + sycl::accessor accB{bufB, cgh, sycl::read_only}; + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; + joint_matrix sub_b; + joint_matrix sub_c; + + joint_matrix_load( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TM) * K + k * TK, + K); + joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr() + + (k * TK) * (N) + sg_starty / sg_size * TN, + N); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + } + joint_matrix_store( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + 
}); // parallel for + }).wait(); +} + +template +int gemm_row_major() { + static constexpr size_t MATRIX_M = TM * 2; + static constexpr size_t MATRIX_N = TN * 2; + static constexpr size_t MATRIX_K = TK * 2; + TA A[MATRIX_M][MATRIX_K]; + TB B[MATRIX_K][MATRIX_N]; + TC C[MATRIX_M][MATRIX_N]; + TC D[MATRIX_M][MATRIX_N]; + + matrix_fill(MATRIX_M, MATRIX_K, (TA *)A, + [](int i, int j) { return 1 * (i + j); }); + matrix_fill(MATRIX_K, MATRIX_N, (TB *)B, + [](int i, int j) { return 2 * i + 3 * j; }); + matrix_fill(MATRIX_M, MATRIX_N, (TC *)C, (TC)1); + matrix_fill(MATRIX_M, MATRIX_N, (TC *)D, (TC)1); + + big_matrix MC((TC *)&C); + big_matrix MD((TC *)&D); + big_matrix MA((TA *)&A); + big_matrix MB((TB *)&B); + matrix_multiply(MC, MA, MB); + matrix_multiply_ref((TA *)A, (TB *)B, (TC *)D, MATRIX_M, MATRIX_N, MATRIX_K); + + bool res = matrix_compare(MATRIX_M, MATRIX_N, (TC *)C, (TC *)D); + std::cout << TM << "x" << TN << "x" << TK << ": "; + std::cout << (res ? "passed" : "failed") << std::endl; + return !res; +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + int res = 0; + for (auto &combination : combinations) { + if (combination.nsize == 0 || + combination.nsize == 16) { // Intel AMX or architecture::intel_gpu_pvc + res += gemm_row_major<8, 16, 16, class bf16_8x16x16, bfloat16, bfloat16, + float>(); + res += gemm_row_major<8, 16, 32, class ss_8x16x32, int8_t, int8_t, + int32_t>(); + res += gemm_row_major<8, 16, 32, class us_8x16x32, uint8_t, int8_t, + int32_t>(); + res += gemm_row_major<8, 16, 32, class su_8x16x32, int8_t, uint8_t, + int32_t>(); + res += gemm_row_major<8, 16, 32, class uu_8x16x32, uint8_t, uint8_t, + int32_t>(); + + if (combination.nsize == 16) { // architecture::intel_gpu_pvc + res += gemm_row_major<16, 16, 16, class bf16_16x16x16, bfloat16, + bfloat16, float>(); + res += gemm_row_major<1, 64, 16, class bf16_1x64x16, bfloat16, bfloat16, + float>(); + res += gemm_row_major<32, 64, 16, class bf16_32x64x16, 
bfloat16, + bfloat16, float>(); + } + break; + } + + if (combination.nsize == 8) { // architecture::intel_gpu_dg2* + res += gemm_row_major<8, 8, 16, class bf16_8x8x16, bfloat16, bfloat16, + float>(); + res += + gemm_row_major<8, 8, 32, class ss_8x8x32, int8_t, int8_t, int32_t>(); + res += + gemm_row_major<8, 8, 32, class us_8x8x32, uint8_t, int8_t, int32_t>(); + res += + gemm_row_major<8, 8, 32, class su_8x8x32, int8_t, uint8_t, int32_t>(); + res += gemm_row_major<8, 8, 32, class uu_8x8x32, uint8_t, uint8_t, + int32_t>(); + break; + } + } + return res; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp index ac650910f237a..215ceb1cbc6c9 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{build} -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" -#include "../joint_matrix_ss_int8_impl.hpp" +#include "common.hpp" +#include "joint_matrix_ss_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8_impl.hpp new file mode 100644 index 0000000000000..974a489002b47 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8_impl.hpp @@ -0,0 +1,128 @@ +//===---joint_matrix_ss_int8_impl.hpp - DPC++ joint_matrix-----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +template class mult; + +template +void matrix_multiply(big_matrix &C, big_matrix &A, + big_matrix &B) { + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC(C.get_data(), range<2>(M, N)); + + queue q; + size_t sg_size = get_sg_size>(q); + q.submit([&](handler &cgh) { + accessor accA{bufA, cgh}; + accessor accB{bufB, cgh}; + accessor accC{bufC, cgh}; + + cgh.parallel_for>( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; + // For B, we assume B has been already VNNIed. 
+ joint_matrix + sub_b; + joint_matrix sub_c; + + joint_matrix_fill(sg, sub_c, 0); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TM) * K + k * TK, + K); + joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr() + + (k * TK / VNNI) * (N * VNNI) + + sg_starty / sg_size * TN * VNNI, + N * VNNI); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + } + joint_matrix_store( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +template +void test() { + static constexpr size_t MATRIX_M = TM * 2; + static constexpr size_t MATRIX_N = TN * 2; + static constexpr size_t MATRIX_K = TK * 2; + T A[MATRIX_M][MATRIX_K]; + T B[MATRIX_K / VNNI][MATRIX_N * VNNI]; + TResult C[MATRIX_M][MATRIX_N]; + TResult D[MATRIX_M][MATRIX_N]; + + matrix_fill(MATRIX_M, MATRIX_K, (T *)A, + [](int i, int j) { return i + 2 * j; }); + matrix_fill(MATRIX_K / VNNI, MATRIX_N * VNNI, (T *)B, + [](int i, int j) { return i + j; }); + matrix_fill(MATRIX_M, MATRIX_N, (TResult *)C, 0); + matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, 0); + + big_matrix MC((TResult *)&C); + big_matrix MD((TResult *)&D); + big_matrix MA((T *)&A); + big_matrix MB((T *)&B); + matrix_multiply( + MC, MA, MB); + matrix_multiply_ref((T *)A, (T *)B, (TResult *)D, + MATRIX_M, MATRIX_N, MATRIX_K / VNNI); + + assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)C, (TResult *)D)); +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test(); + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test(); + break; + } + } + return 0; +} diff --git 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp index 55d85decd7caa..f1932a3a55314 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{build} -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" -#include "../joint_matrix_su_int8_impl.hpp" +#include "common.hpp" +#include "joint_matrix_su_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8_impl.hpp new file mode 100644 index 0000000000000..17c9d47f61c36 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8_impl.hpp @@ -0,0 +1,132 @@ +//===---joint_matrix_su_int8_impl.hpp - DPC++ joint_matrix-----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +template class mult; + +template +void matrix_multiply(big_matrix &C, big_matrix &A, + big_matrix &B) { + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC(C.get_data(), range<2>(M, N)); + + queue q; + size_t sg_size = get_sg_size>(q); + q.submit([&](handler &cgh) { + accessor accA{bufA, cgh}; + accessor accB{bufB, cgh}; + accessor accC{bufC, cgh}; + + cgh.parallel_for>( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; + // For B, we assume B has been already VNNIed. 
+ joint_matrix + sub_b; + joint_matrix sub_c; + + joint_matrix_load( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TM) * K + k * TK, + K); + joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr() + + (k * TK / VNNI) * (N * VNNI) + + sg_starty / sg_size * TN * VNNI, + N * VNNI); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + } + joint_matrix_store( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +template +void test() { + static constexpr size_t MATRIX_M = TM * 2; + static constexpr size_t MATRIX_N = TN * 2; + static constexpr size_t MATRIX_K = TK * 2; + TS A[MATRIX_M][MATRIX_K]; + TU B[MATRIX_K / VNNI][MATRIX_N * VNNI]; + TResult C[MATRIX_M][MATRIX_N]; + TResult D[MATRIX_M][MATRIX_N]; + + matrix_fill(MATRIX_M, MATRIX_K, (TS *)A, + [](int i, int j) { return i + 2 * j; }); + matrix_fill(MATRIX_K / VNNI, MATRIX_N * VNNI, (TU *)B, + [](int i, int j) { return i + j; }); + matrix_fill(MATRIX_M, MATRIX_N, (TResult *)C, 1); + matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, 1); + + big_matrix MC((TResult *)&C); + big_matrix MD((TResult *)&D); + big_matrix MA((TS *)&A); + big_matrix MB((TU *)&B); + matrix_multiply(MC, MA, MB); + matrix_multiply_ref( + (TS *)A, (TU *)B, (TResult *)D, MATRIX_M, MATRIX_N, MATRIX_K / VNNI); + + assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)C, (TResult *)D)); +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test(); + break; + } + + if (combinations[i].nsize == 8) { // 
architecture::intel_gpu_dg2* + test(); + break; + } + } + return 0; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp index bdf6afc5bc395..922b79f356e78 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp @@ -5,14 +5,14 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, matrix-tf32 +// REQUIRES: matrix-tf32 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{build} -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" +#include "common.hpp" constexpr size_t TN = 16; -#include "../joint_matrix_tf32_impl.hpp" +#include "joint_matrix_tf32_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32_impl.hpp new file mode 100644 index 0000000000000..69991884c0710 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32_impl.hpp @@ -0,0 +1,120 @@ +//===---joint_matrix_tf32_impl.hpp - DPC++ joint_matrix--------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +constexpr size_t TM = 8; +constexpr size_t TK = 8; + +template +void matrix_multiply(big_matrix &C, + big_matrix &A, + big_matrix &B) { + size_t M = NUM_ROWS_C; + size_t N = NUM_COLS_C; + size_t K = NUM_COLS_A; + + assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B); + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC((float *)C.get_data(), range<2>(M, N)); + + queue q; + size_t sg_size = get_sg_size(q); + q.submit([&](handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + // The matrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + joint_matrix + sub_b; + joint_matrix sub_c; + joint_matrix_load( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + for (int k = 0; k < K; k += TK) { + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TM) * K + k, + K); + joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr() + + (k) * (N) + sg_starty / sg_size * TN, + N); + // If no rounding to tf32 function is called, joint_matrix_mad + // function 
will work on truncated floats. + joint_matrix_apply(sg, sub_a, + [=](float &x) { x = round_to_tf32(x); }); + joint_matrix_apply(sg, sub_b, + [=](float &x) { x = round_to_tf32(x); }); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + } + + joint_matrix_store( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +int main() { + static constexpr size_t MATRIX_M = TM * 2; + static constexpr size_t MATRIX_N = TN * 2; + static constexpr size_t MATRIX_K = TK * 2; + float A[MATRIX_M][MATRIX_K]; + float B[MATRIX_K][MATRIX_N]; + float C[MATRIX_M][MATRIX_N]; + float D[MATRIX_M][MATRIX_N]; + + matrix_fill(MATRIX_M, MATRIX_K, (float *)A, + [](int i, int j) { return 1.0f * (i + j); }); + matrix_fill(MATRIX_K, MATRIX_N, (float *)B, + [](int i, int j) { return 2.0f * i + 3.0f * j; }); + matrix_fill(MATRIX_M, MATRIX_N, (float *)C, 1.0f); + matrix_fill(MATRIX_M, MATRIX_N, (float *)D, 1.0f); + + big_matrix MC((float *)&C); + big_matrix MD((float *)&D); + big_matrix MA((float *)&A); + big_matrix MB((float *)&B); + matrix_multiply(MC, MA, MB); + matrix_multiply_ref((float *)A, (float *)B, (float *)D, MATRIX_M, MATRIX_N, + MATRIX_K); + + bool res = matrix_compare(MATRIX_M, MATRIX_N, (float *)C, (float *)D); + std::cout << (res ? "passed" : "failed") << std::endl; + return !res; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC.cpp new file mode 100644 index 0000000000000..b81093293cd33 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC.cpp @@ -0,0 +1,14 @@ +//==----------- joint_matrix_transposeC.cpp - DPC++ joint_matrix-----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include "common.hpp" +#include "joint_matrix_transposeC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC_impl.hpp new file mode 100644 index 0000000000000..278e5da5cf441 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC_impl.hpp @@ -0,0 +1,151 @@ +//===---joint_matrix_transposeC_impl.hpp - DPC++ joint_matrix--------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +template class LS; + +template +void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major, + queue q) { + size_t M = NUM_ROWS; + size_t N = NUM_COLS; + + static_assert((NUM_ROWS % TM) == 0); + static_assert((NUM_COLS % TN) == 0); + + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + size_t sg_size = get_sg_size>(q); + + q.submit([&](handler &cgh) { + cgh.parallel_for>( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + auto p_input = + address_space_cast(input); + + auto p_out_col_major = + address_space_cast(out_col_major); + auto p_out_row_major = + address_space_cast(out_row_major); + + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + 
+ sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_matrix; + + auto row_major_offset = + (sg_startx * TM) * N + (sg_starty / sg_size * TN); + auto col_major_offset = + (sg_startx * TM) + (sg_starty / sg_size * TN) * M; + + joint_matrix_load(sg, sub_matrix, p_input + col_major_offset, M, + layout::col_major); + + joint_matrix_store(sg, sub_matrix, + p_out_col_major + row_major_offset, N, + layout::row_major); + + joint_matrix_store(sg, sub_matrix, + p_out_row_major + col_major_offset, M, + layout::col_major); + }); // parallel for + }).wait(); +} + +template void run_matrix_test() { + static constexpr size_t MATRIX_M = TM * 16; + static constexpr size_t MATRIX_N = TN * 16; + + queue q; + T *input = malloc_shared(MATRIX_M * MATRIX_N, q); + T *out_col_major = malloc_shared(MATRIX_M * MATRIX_N, q); + T *out_row_major = malloc_shared(MATRIX_M * MATRIX_N, q); + T *ref_col_major = malloc_shared(MATRIX_M * MATRIX_N, q); + + // input is column majot matrix so it is of NxM shape + matrix_rand(MATRIX_N, MATRIX_M, input, (T)5.0); + matrix_fill(MATRIX_M, MATRIX_N, out_col_major, (T)0); + matrix_fill(MATRIX_N, MATRIX_M, out_row_major, (T)0); + matrix_transpose(MATRIX_N, MATRIX_M, ref_col_major, input); + + matrix_load_and_store(input, out_col_major, + out_row_major, q); + + // we use exact comparison as no low precision calculation is used in this + // test + std::cout << "compare results for: " << TM << " x " << TN << " [TM x TN]" + << std::endl; + bool res = + matrix_compare(MATRIX_M, MATRIX_N, out_col_major, + ref_col_major) && + matrix_compare(MATRIX_N, MATRIX_M, out_row_major, input); + free(input, q); + free(out_col_major, q); + free(out_row_major, q); + free(ref_col_major, q); + assert(res); +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + 
run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + run_matrix_test(); + break; + } + } + return 0; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_unaligned_k.cpp new file mode 100644 index 0000000000000..e1cf6cb6cf8bb --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_unaligned_k.cpp @@ -0,0 +1,20 @@ +//==-------- joint_matrix_unaligned_k.cpp - DPC++ joint_matrix-------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// XFAIL:* + +#include "common.hpp" + +constexpr size_t TN = 16; +static constexpr size_t MATRIX_K = 1024 + 14; + +#include "joint_matrix_out_bounds_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp index 08e2f532b887c..e433935be58ea 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{build} -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" -#include "../joint_matrix_us_int8_impl.hpp" +#include "common.hpp" +#include "joint_matrix_us_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8_impl.hpp new file mode 100644 index 0000000000000..baef5c195a1e6 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8_impl.hpp @@ -0,0 +1,133 @@ +//===---joint_matrix_us_int8_impl.hpp - DPC++ joint_matrix-----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +template class mult; + +template +void matrix_multiply(big_matrix &C, big_matrix &A, + big_matrix &B) { + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC(C.get_data(), range<2>(M, N)); + + queue q; + size_t sg_size = get_sg_size>(q); + q.submit([&](handler &cgh) { + accessor accA{bufA, cgh}; + accessor accB{bufB, cgh}; + accessor accC{bufC, cgh}; + + cgh.parallel_for>( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; + // For B, we assume B has been already VNNIed. + joint_matrix + sub_b; + joint_matrix sub_c; + + joint_matrix_load( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TM) * K + k * TK, + K); + // Assuming B data is already in VNNI format. 
+ joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr() + + (k * TK / VNNI) * (N * VNNI) + + sg_starty / sg_size * TN * VNNI, + N * VNNI); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + } + joint_matrix_store( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +template +void test() { + static constexpr size_t MATRIX_M = TM * 2; + static constexpr size_t MATRIX_N = TN * 2; + static constexpr size_t MATRIX_K = TK * 2; + TU A[MATRIX_M][MATRIX_K]; + TS B[MATRIX_K / VNNI][MATRIX_N * VNNI]; + TResult C[MATRIX_M][MATRIX_N]; + TResult D[MATRIX_M][MATRIX_N]; + + matrix_fill(MATRIX_M, MATRIX_K, (TU *)A, + [](int i, int j) { return i + 2 * j; }); + matrix_fill(MATRIX_K / VNNI, MATRIX_N * VNNI, (TS *)B, + [](int i, int j) { return i + j; }); + matrix_fill(MATRIX_M, MATRIX_N, (TResult *)C, 1); + matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, 1); + + big_matrix MC((TResult *)&C); + big_matrix MD((TResult *)&D); + big_matrix MA((TU *)&A); + big_matrix MB((TS *)&B); + matrix_multiply(MC, MA, MB); + matrix_multiply_ref( + (TU *)A, (TS *)B, (TResult *)D, MATRIX_M, MATRIX_N, MATRIX_K / VNNI); + + assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)C, (TResult *)D)); +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test(); + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test(); + break; + } + } + return 0; +} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp index f1b19bc257b1d..f5838138e1ae2 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp +++ 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out +// RUN: %{build} -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" -#include "../joint_matrix_uu_int8_impl.hpp" +#include "common.hpp" +#include "joint_matrix_uu_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8_impl.hpp new file mode 100644 index 0000000000000..090b552848f0b --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8_impl.hpp @@ -0,0 +1,133 @@ +//===---joint_matrix_uu_int8_impl.hpp - DPC++ joint_matrix-----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +template class mult; + +template +void matrix_multiply(big_matrix &C, big_matrix &A, + big_matrix &B) { + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC(C.get_data(), range<2>(M, N)); + + queue q; + size_t sg_size = get_sg_size>(q); + q.submit([&](handler &cgh) { + accessor accA{bufA, cgh}; + accessor accB{bufB, cgh}; + accessor accC{bufC, cgh}; + + cgh.parallel_for>( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; + // For B, we assume B has been already VNNIed. + joint_matrix + sub_b; + joint_matrix sub_c; + + joint_matrix_load( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TM) * K + k * TK, + K); + // Assuming B data is already in VNNI format. 
+ joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr() + + (k * TK / VNNI) * (N * VNNI) + + sg_starty / sg_size * TN * VNNI, + N * VNNI); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + } + joint_matrix_store( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +template +void test() { + static constexpr size_t MATRIX_M = TM * 2; + static constexpr size_t MATRIX_N = TN * 2; + static constexpr size_t MATRIX_K = TK * 2; + T A[MATRIX_M][MATRIX_K]; + T B[MATRIX_K / VNNI][MATRIX_N * VNNI]; + TResult C[MATRIX_M][MATRIX_N]; + TResult D[MATRIX_M][MATRIX_N]; + + matrix_fill(MATRIX_M, MATRIX_K, (T *)A, + [](int i, int j) { return i + 2 * j; }); + matrix_fill(MATRIX_K / VNNI, MATRIX_N * VNNI, (T *)B, + [](int i, int j) { return i + j; }); + matrix_fill(MATRIX_M, MATRIX_N, (TResult *)C, 1); + matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, 1); + + big_matrix MC((TResult *)&C); + big_matrix MD((TResult *)&D); + big_matrix MA((T *)&A); + big_matrix MB((T *)&B); + matrix_multiply( + MC, MA, MB); + matrix_multiply_ref((T *)A, (T *)B, (TResult *)D, + MATRIX_M, MATRIX_N, MATRIX_K / VNNI); + + assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)C, (TResult *)D)); +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test(); + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test(); + break; + } + } + return 0; +} From d91d68a43bde909c8a67f0327a16f42bdea73212 Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Mon, 12 Aug 2024 07:58:12 -0700 Subject: [PATCH 06/19] apply scripts Signed-off-by: Sidorov, Dmitry --- .../Matrix/SPVCooperativeMatrix/SG32/element_wise_abc.cpp | 2 +- 
.../SPVCooperativeMatrix/SG32/element_wise_all_ops.cpp | 2 +- .../SG32/element_wise_all_ops_half.cpp | 2 +- .../SG32/element_wise_all_ops_int8.cpp | 2 +- .../SG32/element_wise_all_ops_int8_packed.cpp | 2 +- .../SG32/element_wise_all_ops_tf32.cpp | 2 +- .../SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp | 2 +- .../Matrix/SPVCooperativeMatrix/SG32/element_wise_ops.cpp | 2 +- .../SPVCooperativeMatrix/SG32/get_coord_float_matC.cpp | 2 +- .../SPVCooperativeMatrix/SG32/get_coord_int8_matA.cpp | 2 +- .../SPVCooperativeMatrix/SG32/get_coord_int8_matB.cpp | 2 +- .../SPVCooperativeMatrix/SG32/joint_matrix_all_sizes.cpp | 2 +- .../SG32/joint_matrix_annotated_ptr.cpp | 2 +- .../SPVCooperativeMatrix/SG32/joint_matrix_apply_bf16.cpp | 2 +- .../SG32/joint_matrix_apply_two_matrices.cpp | 2 +- .../SG32/joint_matrix_bf16_fill_k_cache.cpp | 2 +- .../SG32/joint_matrix_bf16_fill_k_cache_init.cpp | 2 +- .../SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp | 2 +- .../SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp | 2 +- .../SPVCooperativeMatrix/SG32/joint_matrix_bfloat16.cpp | 2 +- .../SG32/joint_matrix_bfloat16_array.cpp | 2 +- .../SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp | 2 +- .../SG32/joint_matrix_bfloat16_packedB.cpp | 2 +- .../SG32/joint_matrix_colA_rowB_colC.cpp | 2 +- .../SG32/joint_matrix_down_convert.cpp | 2 +- .../SPVCooperativeMatrix/SG32/joint_matrix_half.cpp | 2 +- .../SG32/joint_matrix_int8_colmajorA_colmajorB.cpp | 2 +- .../SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp | 2 +- .../SPVCooperativeMatrix/SG32/joint_matrix_out_bounds.cpp | 2 +- .../SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp | 2 +- .../SG32/joint_matrix_rowmajorA_rowmajorB.cpp | 2 +- .../SPVCooperativeMatrix/SG32/joint_matrix_ss_int8.cpp | 2 +- .../SPVCooperativeMatrix/SG32/joint_matrix_su_int8.cpp | 2 +- .../SPVCooperativeMatrix/SG32/joint_matrix_tf32.cpp | 2 +- .../SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp | 2 +- .../SG32/joint_matrix_unaligned_k.cpp | 2 +- 
.../SPVCooperativeMatrix/SG32/joint_matrix_us_int8.cpp | 2 +- .../SPVCooperativeMatrix/SG32/joint_matrix_uu_int8.cpp | 2 +- .../SPVCooperativeMatrix/element_wise_abc.cpp | 2 +- .../SPVCooperativeMatrix/element_wise_ops.cpp | 2 +- .../SPVCooperativeMatrix/joint_matrix_bfloat16.cpp | 2 +- .../SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp | 2 +- .../SPVCooperativeMatrix/joint_matrix_half.cpp | 2 +- .../SPVCooperativeMatrix/joint_matrix_prefetch.cpp | 2 +- .../SPVCooperativeMatrix/joint_matrix_ss_int8.cpp | 2 +- .../SPVCooperativeMatrix/joint_matrix_su_int8.cpp | 2 +- .../SPVCooperativeMatrix/joint_matrix_tf32.cpp | 2 +- .../SPVCooperativeMatrix/joint_matrix_us_int8.cpp | 2 +- .../SPVCooperativeMatrix/joint_matrix_uu_int8.cpp | 2 +- .../Matrix/SPVCooperativeMatrix/element_wise_abc.cpp | 8 ++++---- .../Matrix/SPVCooperativeMatrix/element_wise_all_ops.cpp | 8 ++++---- .../SPVCooperativeMatrix/element_wise_all_ops_1d.cpp | 8 ++++---- .../SPVCooperativeMatrix/element_wise_all_ops_1d_cont.cpp | 8 ++++---- .../SPVCooperativeMatrix/element_wise_all_ops_half.cpp | 8 ++++---- .../SPVCooperativeMatrix/element_wise_all_ops_int8.cpp | 8 ++++---- .../element_wise_all_ops_int8_packed.cpp | 8 ++++---- .../SPVCooperativeMatrix/element_wise_all_ops_scalar.cpp | 8 ++++---- .../SPVCooperativeMatrix/element_wise_all_ops_tf32.cpp | 8 ++++---- .../SPVCooperativeMatrix/element_wise_all_sizes.cpp | 8 ++++---- .../Matrix/SPVCooperativeMatrix/element_wise_ops.cpp | 8 ++++---- .../SPVCooperativeMatrix/elemwise_irreg_size_ops_bf16.cpp | 6 +++--- .../Matrix/SPVCooperativeMatrix/get_coord_float_matC.cpp | 8 ++++---- .../Matrix/SPVCooperativeMatrix/get_coord_int8_matA.cpp | 8 ++++---- .../Matrix/SPVCooperativeMatrix/get_coord_int8_matB.cpp | 8 ++++---- .../SPVCooperativeMatrix/joint_matrix_all_sizes.cpp | 8 ++++---- .../SPVCooperativeMatrix/joint_matrix_annotated_ptr.cpp | 8 ++++---- .../SPVCooperativeMatrix/joint_matrix_apply_bf16.cpp | 8 ++++---- .../joint_matrix_apply_two_matrices.cpp | 6 
+++--- .../joint_matrix_bf16_fill_k_cache.cpp | 8 ++++---- .../joint_matrix_bf16_fill_k_cache_OOB.cpp | 6 +++--- .../joint_matrix_bf16_fill_k_cache_init.cpp | 8 ++++---- .../joint_matrix_bf16_fill_k_cache_unroll.cpp | 6 +++--- .../joint_matrix_bf16_fill_k_cache_unroll_init.cpp | 6 +++--- .../Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp | 8 ++++---- .../SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp | 8 ++++---- .../joint_matrix_bfloat16_colmajorA_colmajorB.cpp | 8 ++++---- .../joint_matrix_bfloat16_packedB.cpp | 8 ++++---- .../SPVCooperativeMatrix/joint_matrix_colA_rowB_colC.cpp | 8 ++++---- .../SPVCooperativeMatrix/joint_matrix_down_convert.cpp | 8 ++++---- .../Matrix/SPVCooperativeMatrix/joint_matrix_half.cpp | 8 ++++---- .../joint_matrix_int8_colmajorA_colmajorB.cpp | 8 ++++---- .../joint_matrix_int8_rowmajorA_rowmajorB.cpp | 8 ++++---- .../joint_matrix_opt_kernel_feature.cpp | 8 ++++---- .../joint_matrix_opt_kernel_feature_unsupported_hw.cpp | 6 +++--- .../SPVCooperativeMatrix/joint_matrix_out_bounds.cpp | 8 ++++---- .../Matrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp | 8 ++++---- .../SPVCooperativeMatrix/joint_matrix_query_default.cpp | 4 ++-- .../joint_matrix_rowmajorA_rowmajorB.cpp | 8 ++++---- .../Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp | 8 ++++---- .../Matrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp | 8 ++++---- .../Matrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp | 8 ++++---- .../SPVCooperativeMatrix/joint_matrix_transposeC.cpp | 8 ++++---- .../SPVCooperativeMatrix/joint_matrix_unaligned_k.cpp | 8 ++++---- .../Matrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp | 8 ++++---- .../Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp | 8 ++++---- 95 files changed, 225 insertions(+), 225 deletions(-) diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_abc.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_abc.cpp index 1fdd989ae091f..7a471ae628584 100644 --- 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_abc.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_abc.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops.cpp index fdcc7d2aef9f7..8192567a039f3 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_half.cpp index 847f4a7812aa2..e65396893c54a 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_half.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_half.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-fp16 +// REQUIRES: gpu, aspect-fp16 // REQUIRES: aspect-ext_intel_matrix, gpu // REQUIRES-INTEL-DRIVER: lin: 27501, win: 
101.4943 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8.cpp index 984ff9a9b082f..25e5a628eb894 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8_packed.cpp index af2f4df82b648..2f199d13355b5 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8_packed.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8_packed.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_tf32.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_tf32.cpp index 06d459a2a3ce5..e42fc1d04fe8e 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_tf32.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_tf32.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // 
//===----------------------------------------------------------------------===// -// REQUIRES: matrix-tf32 +// REQUIRES: gpu, matrix-tf32 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp index 4624110577ea2..66ce7cf96e40a 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: gpu, matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_ops.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_ops.cpp index 1a09518e65ffb..c18c37858646b 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_ops.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_ops.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_float_matC.cpp index 9b4777faced20..f2d82933d4012 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_float_matC.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_float_matC.cpp @@ -7,7 +7,7 @@ 
//===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matA.cpp index 09c3ccf9983ab..a5436b255e513 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matA.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matA.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matB.cpp index 80e0c0c6b845d..ed6b730faf129 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matB.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_all_sizes.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_all_sizes.cpp index 3fe1f33343b17..fe5b0c57b0c25 100644 --- 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_all_sizes.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_annotated_ptr.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_annotated_ptr.cpp index 7a6aca1024418..a3e450846f45b 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_annotated_ptr.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_annotated_ptr.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: gpu, matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_bf16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_bf16.cpp index 88d5ba29922de..1f5ed4df8d3a8 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_bf16.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_bf16.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp index 8e5ebeed1c222..d4caa86132d85 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache.cpp index 10334f93afa80..0c99f34aa5d82 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out -ffp-model=precise diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp index 32af965ec431a..9b4ca122d8f59 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp @@ -7,7 +7,7 @@ 
//===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-ext_intel_matrix, gpu +// REQUIRES: gpu, aspect-ext_intel_matrix, gpu // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out -DINIT_LIST -ffp-model=precise diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp index 1c7533e331e73..53230a2d1d40e 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -mllvm -inline-threshold=5000 -ffp-model=precise -o %t.out -DMANUAL_UNROLL diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp index f8d30cdc26756..6c46816878b15 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-ext_intel_matrix, gpu +// REQUIRES: gpu, aspect-ext_intel_matrix, gpu // 
REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -mllvm -inline-threshold=5000 -ffp-model=precise -o %t_gpu.out -DINIT_LIST -DMANUAL_UNROLL diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16.cpp index 4857ded37a011..fbdbe6df4ead4 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_array.cpp index 87fd837446618..abd4de12d8e85 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_array.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_array.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp index 6532bcfe47bff..0cf5b40563c0e 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp +++ 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: gpu, matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp index d2ba1cdbdd38e..555d779ae1c10 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: gpu, matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_colA_rowB_colC.cpp index 70e53441cb48f..e1f7cc9c24cb6 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_colA_rowB_colC.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_colA_rowB_colC.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: gpu, matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_down_convert.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_down_convert.cpp index b474f846d11d5..f173f12b604c9 100644 --- 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_down_convert.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_down_convert.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: gpu, matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_half.cpp index 2b990ea3c6098..07f83c5b24315 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_half.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_half.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-fp16 +// REQUIRES: gpu, aspect-fp16 // REQUIRES: aspect-ext_intel_matrix // REQUIRES: matrix-fp16 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp index c89c657c77fbc..b978573afdcae 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: gpu, matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp index d73a2300888d3..8512f84df4a57 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_out_bounds.cpp index 1848a480a0eb7..fb571b4dc12c3 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_out_bounds.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_out_bounds.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: gpu, matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp index a6ce09fe13b9f..6f328ec11fffd 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp index 0fe45a24c499f..ada0c37286b96 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // This tests support of row major layout for matrix B which does automatic VNNI -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // VNNI transform and sub-group size 32 are not supported yet on DG2 by IGC // UNSUPPORTED: gpu-intel-dg2 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_ss_int8.cpp index 08f44d2a00090..97cd8bb817a6b 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_ss_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_ss_int8.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_su_int8.cpp index c260cb7558ed5..3feb8272fdfa1 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_su_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_su_int8.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG 
size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_tf32.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_tf32.cpp index 18da250bc808d..c57542890d1ec 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_tf32.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_tf32.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix-tf32 +// REQUIRES: gpu, matrix-tf32 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp index 6cea5a248e0b2..4a1d8fd618775 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp index f4b2426af93a8..74a5b6fefa1a9 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp +++ 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: gpu, matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_us_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_us_int8.cpp index a0ed38373a43a..9413b5e20ef82 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_us_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_us_int8.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_uu_int8.cpp index d5c5281f67c9f..501264f2ce1f1 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_uu_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_uu_int8.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/element_wise_abc.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/element_wise_abc.cpp index 
3bf9030658112..c8bc57c62912b 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/element_wise_abc.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/element_wise_abc.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: gpu, gpu, aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/element_wise_ops.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/element_wise_ops.cpp index 19c2d5ef9c273..ae9e21855bcfb 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/element_wise_ops.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/element_wise_ops.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: gpu, gpu, aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp index db664554a1a14..2dd6b2f9e7ebf 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: gpu, gpu, aspect-ext_intel_matrix // RUN: %{build} 
-D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp index bcbb20228ac00..a0a67f55fa4ab 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: gpu, gpu, aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_half.cpp index 424bb83050048..378556e190a7a 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_half.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_half.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-fp16 +// REQUIRES: gpu, aspect-fp16 // REQUIRES: aspect-ext_intel_matrix // REQUIRES: matrix-fp16 // REQUIRES: gpu diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp index fe3c25e338094..918c3df7db407 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp @@ -5,7 +5,7 @@ // 
SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: gpu, gpu, aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp index ac650910f237a..0b6fa4ac88fe3 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: gpu, gpu, aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp index 55d85decd7caa..b9f746f4ea8fc 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: gpu, gpu, aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp index bdf6afc5bc395..c006e6b401b4a 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, matrix-tf32 +// REQUIRES: gpu, gpu, matrix-tf32 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp index 08e2f532b887c..38d01c67df783 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: gpu, gpu, aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp index f1b19bc257b1d..ac01a98568e9a 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: 
gpu, aspect-ext_intel_matrix +// REQUIRES: gpu, gpu, aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc.cpp index 8a4e277bf6d11..3bf9030658112 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "common.hpp" -#include "element_wise_abc_impl.hpp" +#include "../common.hpp" +#include "../element_wise_abc_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops.cpp index d14240b145e81..0fe6ccae0cc83 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops.cpp @@ -5,13 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // TODO: Reenable, see https://github.com/intel/llvm/issues/14598 // UNSUPPORTED: windows, linux -#include "common.hpp" -#include "element_wise_all_ops_impl.hpp" +#include "../common.hpp" +#include "../element_wise_all_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp index ad6bd117044a3..72d7bf86884cf 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp @@ -5,13 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix, gpu +// REQUIRES: gpu, aspect-ext_intel_matrix, gpu -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out // TODO: Reenable, see https://github.com/intel/llvm/issues/14598 // UNSUPPORTED: windows, linux -#include "common.hpp" -#include "element_wise_all_ops_impl.hpp" +#include "../common.hpp" +#include "../element_wise_all_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d_cont.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d_cont.cpp index 46e0771238de6..f96a0ab5305aa 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d_cont.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d_cont.cpp @@ -5,13 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix, gpu +// REQUIRES: gpu, aspect-ext_intel_matrix, gpu -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: env IGC_JointMatrixLoadStoreOpt=2 %{run} %t.out // TODO: Reenable, see https://github.com/intel/llvm/issues/14598 // UNSUPPORTED: windows, linux -#include "common.hpp" -#include "element_wise_all_ops_impl.hpp" +#include "../common.hpp" +#include "../element_wise_all_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half.cpp 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half.cpp index bb651568f9251..c2f4173890b01 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half.cpp @@ -5,12 +5,12 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-fp16 +// REQUIRES: gpu, aspect-fp16 // REQUIRES: aspect-ext_intel_matrix // REQUIRES: matrix-fp16 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "common.hpp" -#include "element_wise_all_ops_half_impl.hpp" +#include "../common.hpp" +#include "../element_wise_all_ops_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8.cpp index ba538e4ebffef..53826649f7c12 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "common.hpp" -#include "element_wise_all_ops_int8_impl.hpp" +#include "../common.hpp" +#include "../element_wise_all_ops_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed.cpp index be27718279b79..f19401cbb0d3f 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed.cpp +++ 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed.cpp @@ -5,12 +5,12 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // This test stores the matrix B that is VNNIed (packed). -#include "common.hpp" -#include "element_wise_all_ops_int8_packed_impl.hpp" +#include "../common.hpp" +#include "../element_wise_all_ops_int8_packed_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_scalar.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_scalar.cpp index 93432bca4a03e..e599ab9015a09 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_scalar.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_scalar.cpp @@ -5,13 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix, gpu +// REQUIRES: gpu, aspect-ext_intel_matrix, gpu -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out // TODO: Reenable, see https://github.com/intel/llvm/issues/14598 // UNSUPPORTED: windows, linux -#include "common.hpp" -#include "element_wise_all_ops_impl.hpp" +#include "../common.hpp" +#include "../element_wise_all_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32.cpp index 6e2f8dcff6384..9612d677506d0 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32.cpp +++ 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32.cpp @@ -5,14 +5,14 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix-tf32 +// REQUIRES: gpu, matrix-tf32 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "common.hpp" +#include "../common.hpp" constexpr size_t TN = 16; -#include "element_wise_all_ops_tf32_impl.hpp" +#include "../element_wise_all_ops_tf32_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp index 83db076d2be20..301b8b30e69c5 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp @@ -5,13 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // TODO: Reenable, see https://github.com/intel/llvm/issues/14598 // UNSUPPORTED: windows, linux -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // This is a version of the test with disabled device code @@ -19,5 +19,5 @@ // RUN: %{build} -fsycl-device-code-split=off -o %t_split.out // RUN: %if gpu-intel-dg2 %{ %{run} %t_split.out %} -#include "common.hpp" -#include "element_wise_all_sizes_impl.hpp" +#include "../common.hpp" +#include "../element_wise_all_sizes_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp index 855ed65900a6f..19c2d5ef9c273 100644 --- 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "common.hpp" -#include "element_wise_ops_impl.hpp" +#include "../common.hpp" +#include "../element_wise_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/elemwise_irreg_size_ops_bf16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/elemwise_irreg_size_ops_bf16.cpp index 5b4c785d46d3d..e8b6f18c21a9d 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/elemwise_irreg_size_ops_bf16.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/elemwise_irreg_size_ops_bf16.cpp @@ -8,13 +8,13 @@ // This test is for element wise operations when matrix size does not multiply // SG size. This corner case only applies to AMX. Also, it tests bf16 type. 
// only run this on AMX -// REQUIRES: cpu +// REQUIRES: gpu, cpu // REQUIRES: matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "common.hpp" +#include "../common.hpp" #define SG_SZ 16 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC.cpp index 044c8072bb3bc..dce56aebad781 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "common.hpp" -#include "get_coord_float_matC_impl.hpp" +#include "../common.hpp" +#include "../get_coord_float_matC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matA.cpp index 8e953c8689305..4ade538001d98 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matA.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matA.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "common.hpp" -#include "get_coord_int8_matA_impl.hpp" +#include "../common.hpp" +#include "../get_coord_int8_matA_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB.cpp 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB.cpp index a5166bbf54201..c8de880baccf3 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB.cpp @@ -5,11 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // XFAIL: cpu -#include "common.hpp" -#include "get_coord_int8_matB_impl.hpp" +#include "../common.hpp" +#include "../get_coord_int8_matB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes.cpp index e4a9523314ea0..6a93d9ba12747 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "common.hpp" -#include "joint_matrix_all_sizes_impl.hpp" +#include "../common.hpp" +#include "../joint_matrix_all_sizes_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_annotated_ptr.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_annotated_ptr.cpp index e21bdaa06a328..9881c24e35a22 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_annotated_ptr.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_annotated_ptr.cpp @@ -5,15 +5,15 @@ // SPDX-License-Identifier: 
Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: gpu, matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %} // RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %} -#include "common.hpp" +#include "../common.hpp" constexpr size_t TN = 16; -#include "joint_matrix_annotated_ptr_impl.hpp" +#include "../joint_matrix_annotated_ptr_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16.cpp index cdfa28c07673e..2e2f1294bab8f 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "common.hpp" -#include "joint_matrix_apply_bf16_impl.hpp" +#include "../common.hpp" +#include "../joint_matrix_apply_bf16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp index e61448852710b..d41424f1fa6f0 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: 
aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // RUN: %{build} -ffp-model=precise -o %t.out // RUN: %{run} %t.out -#include "common.hpp" -#include "joint_matrix_apply_two_matrices_impl.hpp" +#include "../common.hpp" +#include "../joint_matrix_apply_two_matrices_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp index 2be4c14615799..014f7cd68e9f6 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp @@ -5,12 +5,12 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix -// RUN: %{build} -o %t.out -ffp-model=precise +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -ffp-model=precise // RUN: %{run} %t.out // -ffp-model=precise is added to not depend on compiler defaults. 
-#include "common.hpp" -#include "joint_matrix_bf16_fill_k_cache_impl.hpp" +#include "../common.hpp" +#include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_OOB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_OOB.cpp index 4d84656c3d451..86cb8f6bf4704 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_OOB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_OOB.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix, gpu +// REQUIRES: gpu, aspect-ext_intel_matrix, gpu // UNSUPPORTED: gpu-intel-dg2 // RUN: %{build} -o %t_gpu.out -ffp-model=precise -DOOB @@ -15,5 +15,5 @@ // -ffp-model=precise is added to not depend on compiler defaults. -#include "common.hpp" -#include "joint_matrix_bf16_fill_k_cache_impl.hpp" +#include "../common.hpp" +#include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_init.cpp index 0770e7881edc7..e116e885d5b2c 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_init.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_init.cpp @@ -5,12 +5,12 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix, gpu +// REQUIRES: gpu, aspect-ext_intel_matrix, gpu -// RUN: %{build} -o %t.out -DINIT_LIST -ffp-model=precise +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -DINIT_LIST -ffp-model=precise // RUN: %{run} %t.out // -ffp-model=precise is added to not depend on 
compiler defaults. -#include "common.hpp" -#include "joint_matrix_bf16_fill_k_cache_impl.hpp" +#include "../common.hpp" +#include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll.cpp index 4f5616d7e7f4f..752d0303b2dfa 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // RUN: %{build} -mllvm -inline-threshold=2000 -ffp-model=precise -o %t.out -DMANUAL_UNROLL // RUN: %{run} %t.out @@ -14,5 +14,5 @@ // since IGC doesn't support some variants of IR for Joint Matrix currently // -ffp-model=precise is added to not depend on compiler defaults. 
-#include "common.hpp" -#include "joint_matrix_bf16_fill_k_cache_impl.hpp" +#include "../common.hpp" +#include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp index ff4c29251200d..d27bc33dbb5a5 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix, gpu +// REQUIRES: gpu, aspect-ext_intel_matrix, gpu // RUN: %{build} -mllvm -inline-threshold=2000 -ffp-model=precise -o %t_gpu.out -DINIT_LIST -DMANUAL_UNROLL // RUN: %{run} %t_gpu.out @@ -14,5 +14,5 @@ // since IGC doesn't support some variants of IR for Joint Matrix currently // -ffp-model=precise is added to not depend on compiler defaults. 
-#include "common.hpp" -#include "joint_matrix_bf16_fill_k_cache_impl.hpp" +#include "../common.hpp" +#include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp index d1fbd4cbda15b..db664554a1a14 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "common.hpp" -#include "joint_matrix_bfloat16_impl.hpp" +#include "../common.hpp" +#include "../joint_matrix_bfloat16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp index 5cd2a4dc1962f..bcbb20228ac00 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "common.hpp" -#include "joint_matrix_bfloat16_array_impl.hpp" +#include "../common.hpp" +#include "../joint_matrix_bfloat16_array_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp index 21d5f1239cd8d..022e26d7f58b6 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -5,9 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: gpu, matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // This tests support of col major layout for matrix B which does transpose and @@ -15,8 +15,8 @@ // XFAIL: gpu -#include "common.hpp" +#include "../common.hpp" constexpr size_t TN = 16; -#include "joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp" +#include "../joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp index bf3a65d2a16d7..f9d2efc51e970 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp @@ -5,14 +5,14 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: gpu, matrix // REQUIRES-INTEL-DRIVER: lin: 27868, win: 101.5181 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=2 %{run} %t.out %} // RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %} // RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %} -#include "common.hpp" -#include "joint_matrix_bfloat16_packedB_impl.hpp" +#include 
"../common.hpp" +#include "../joint_matrix_bfloat16_packedB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_colA_rowB_colC.cpp index 354a71006e129..76c10eb46ec1f 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_colA_rowB_colC.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_colA_rowB_colC.cpp @@ -5,15 +5,15 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: gpu, matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // XFAIL:* -#include "common.hpp" +#include "../common.hpp" constexpr size_t TN = 16; -#include "joint_matrix_colA_rowB_colC_impl.hpp" +#include "../joint_matrix_colA_rowB_colC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_down_convert.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_down_convert.cpp index dee504c22e7f6..7f1953174b118 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_down_convert.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_down_convert.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: gpu, matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "common.hpp" -#include "joint_matrix_down_convert_impl.hpp" +#include "../common.hpp" +#include "../joint_matrix_down_convert_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half.cpp index 44fa8cd86e7cb..7fbcfcd0ee957 100644 --- 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half.cpp @@ -5,12 +5,12 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-fp16 +// REQUIRES: gpu, aspect-fp16 // REQUIRES: aspect-ext_intel_matrix // REQUIRES: matrix-fp16 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "common.hpp" -#include "joint_matrix_half_impl.hpp" +#include "../common.hpp" +#include "../joint_matrix_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB.cpp index 37769a41f7003..dff506ba3effa 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB.cpp @@ -5,9 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: gpu, matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // This tests support of col major layout for matrix B which does transpose and @@ -15,8 +15,8 @@ // XFAIL: gpu -#include "common.hpp" +#include "../common.hpp" constexpr size_t TN = 16; -#include "joint_matrix_int8_colmajorA_colmajorB_impl.hpp" +#include "../joint_matrix_int8_colmajorA_colmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB.cpp index fe2506c2acf69..e37c5deb9ff51 100644 --- 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB.cpp @@ -5,9 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // Run these 2 tests on PVC only for now. Check can be updated to "gpu", @@ -15,5 +15,5 @@ // RUN: %if arch-intel_gpu_pvc %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %} // RUN: %if arch-intel_gpu_pvc %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %} -#include "common.hpp" -#include "joint_matrix_int8_rowmajorA_rowmajorB_impl.hpp" +#include "../common.hpp" +#include "../joint_matrix_int8_rowmajorA_rowmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature.cpp index a6b72f80a989d..7f7c309c906fc 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature.cpp @@ -6,13 +6,13 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // Test checks that exception will be thrown in case matrix parameters are // incompatible on the current device -#include "common.hpp" -#include "joint_matrix_opt_kernel_feature_impl.hpp" +#include "../common.hpp" +#include "../joint_matrix_opt_kernel_feature_impl.hpp" diff --git 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature_unsupported_hw.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature_unsupported_hw.cpp index abf1af49e6933..c8078e08f9151 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature_unsupported_hw.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature_unsupported_hw.cpp @@ -6,15 +6,15 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: gpu-intel-gen12, gpu +// REQUIRES: gpu, gpu-intel-gen12, gpu -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // Test checks that exception will be thrown in case object of joint_matrix type // is used on unsupported HW, in this case, on Gen12. -#include "common.hpp" +#include "../common.hpp" int main() { sycl::queue q; diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds.cpp index f3485408373b9..391edcff2df8d 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds.cpp @@ -5,18 +5,18 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // UNSUPPORTED: gpu-intel-dg2 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // XFAIL:* -#include "common.hpp" +#include "../common.hpp" constexpr size_t TN = 16; constexpr size_t MATRIX_K = 1024 + 24; -#include "joint_matrix_out_bounds_impl.hpp" +#include "../joint_matrix_out_bounds_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp index df5c4b27a122b..fe3c25e338094 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp @@ -5,11 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix -// RUN: %{build} -o %t.out +// REQUIRES: gpu, aspect-ext_intel_matrix +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "common.hpp" +#include "../common.hpp" constexpr size_t TN = 16; -#include "joint_matrix_prefetch_impl.hpp" +#include "../joint_matrix_prefetch_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_query_default.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_query_default.cpp index f160c09525aa7..733ac11bdc516 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_query_default.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_query_default.cpp @@ -6,10 +6,10 @@ // //===----------------------------------------------------------------------===// // Needs AMX. 
-// REQUIRES: cpu +// REQUIRES: gpu, cpu // REQUIRES: matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out #include diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB.cpp index bfcf624abe97b..8397ad43a11cd 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB.cpp @@ -5,11 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix // VNNI transform is not supported yet by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=2 %{run} %t.out %} // RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %} @@ -18,5 +18,5 @@ // This tests support of row major layout for matrix B which does automatic VNNI // transform. 
This is currently only available on AMX and XMX of PVC -#include "common.hpp" -#include "joint_matrix_rowmajorA_rowmajorB_impl.hpp" +#include "../common.hpp" +#include "../joint_matrix_rowmajorA_rowmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp index 215ceb1cbc6c9..ac650910f237a 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "common.hpp" -#include "joint_matrix_ss_int8_impl.hpp" +#include "../common.hpp" +#include "../joint_matrix_ss_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp index f1932a3a55314..55d85decd7caa 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "common.hpp" -#include "joint_matrix_su_int8_impl.hpp" +#include "../common.hpp" +#include "../joint_matrix_su_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp index 922b79f356e78..bdf6afc5bc395 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp @@ -5,14 +5,14 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix-tf32 +// REQUIRES: gpu, matrix-tf32 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "common.hpp" +#include "../common.hpp" constexpr size_t TN = 16; -#include "joint_matrix_tf32_impl.hpp" +#include "../joint_matrix_tf32_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC.cpp index b81093293cd33..7aa8c5cbd41ba 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "common.hpp" -#include "joint_matrix_transposeC_impl.hpp" +#include "../common.hpp" +#include "../joint_matrix_transposeC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_unaligned_k.cpp index e1cf6cb6cf8bb..79cca59d80b15 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_unaligned_k.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_unaligned_k.cpp @@ -5,16 +5,16 @@ 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: gpu, matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // XFAIL:* -#include "common.hpp" +#include "../common.hpp" constexpr size_t TN = 16; static constexpr size_t MATRIX_K = 1024 + 14; -#include "joint_matrix_out_bounds_impl.hpp" +#include "../joint_matrix_out_bounds_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp index e433935be58ea..08e2f532b887c 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "common.hpp" -#include "joint_matrix_us_int8_impl.hpp" +#include "../common.hpp" +#include "../joint_matrix_us_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp index f5838138e1ae2..f1b19bc257b1d 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: gpu, aspect-ext_intel_matrix -// RUN: %{build} -o %t.out +// RUN: %{build} 
-D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "common.hpp" -#include "joint_matrix_uu_int8_impl.hpp" +#include "../common.hpp" +#include "../joint_matrix_uu_int8_impl.hpp" From 4e6912e384bb15964ef3f34eae6f717140d45d28 Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Mon, 12 Aug 2024 07:58:40 -0700 Subject: [PATCH 07/19] remove cpu tests Signed-off-by: Sidorov, Dmitry --- .../elemwise_irreg_size_ops_bf16.cpp | 145 --------------- .../joint_matrix_query_default.cpp | 174 ------------------ 2 files changed, 319 deletions(-) delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/elemwise_irreg_size_ops_bf16.cpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_query_default.cpp diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/elemwise_irreg_size_ops_bf16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/elemwise_irreg_size_ops_bf16.cpp deleted file mode 100644 index e8b6f18c21a9d..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/elemwise_irreg_size_ops_bf16.cpp +++ /dev/null @@ -1,145 +0,0 @@ -//==-------- elemwise_irreg_size_ops_bf16.cpp - DPC++ joint_matrix---- ----==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// This test is for element wise operations when matrix size does not multiply -// SG size. This corner case only applies to AMX. Also, it tests bf16 type. 
-// only run this on AMX -// REQUIRES: gpu, cpu -// REQUIRES: matrix - -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -// RUN: %{run} %t.out - -#include "../common.hpp" - -#define SG_SZ 16 - -// 10x12 is not multiply the sg size, slicing implementation will have to insert -// padding -#define TM 10 -#define TN 12 -#define TK 16 - -template -void matrix_multiply(big_matrix &C, - big_matrix &A, - big_matrix &B) { - size_t M = NUM_ROWS_C; - size_t N = NUM_COLS_C; - size_t K = NUM_COLS_A; - - assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * 2); - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K / 2, N * 2)); - buffer bufC((float *)C.get_data(), range<2>(M, N)); - - queue q; - q.submit([&](handler &cgh) { - auto accC = bufC.get_access(cgh); - auto accA = bufA.get_access(cgh); - auto accB = bufB.get_access(cgh); - - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [accA, accB, accC, M, N, K](nd_item<2> spmd_item) - [[intel::reqd_sub_group_size(SG_SZ)]] - - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_a; - // For B, we assume B has been already VNNIed. 
- joint_matrix - sub_b; - joint_matrix sub_c; - joint_matrix_load( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, - N, layout::row_major); - for (int k = 0; k < K; k += TK) { - joint_matrix_load( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TM) * K + k, - K); - // Assume we alreay in vnni format. - joint_matrix_load( - sg, sub_b, - accB.template get_multi_ptr() + - (k) * (N) + sg_starty / SG_SZ * TN * 2, - N * 2); - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - } - joint_matrix_apply(sg, sub_c, [](float &x) { x += 5.0; }); - joint_matrix_store( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, - N, layout::row_major); - }); // parallel for - }).wait(); -} - -static constexpr size_t MATRIX_M = TM * 2; -static constexpr size_t MATRIX_N = TN * 2; -static constexpr size_t MATRIX_K = TK * 2; -bfloat16 A[MATRIX_M][MATRIX_K]; -bfloat16 B[MATRIX_K / 2][MATRIX_N * 2]; -float C[MATRIX_M][MATRIX_N]; -float D[MATRIX_M][MATRIX_N]; - -int main() { - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_K; j++) { - A[i][j] = bfloat16(1.0f * (i + j)); - } - } - for (int i = 0; i < MATRIX_K / 2; i++) { - for (int j = 0; j < MATRIX_N * 2; j++) { - B[i][j] = bfloat16(2.0f * i + 3.0f * j); - } - } - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) { - C[i][j] = 1.0; - D[i][j] = 1.0; - } - } - - big_matrix MC((float *)&C); - big_matrix MD((float *)&D); - big_matrix MA((bfloat16 *)&A); - big_matrix MB((bfloat16 *)&B); - matrix_multiply(MC, MA, MB); - matrix_multiply_ref( - (bfloat16 *)A, (bfloat16 *)B, (float *)D, MATRIX_M, MATRIX_N, - MATRIX_K / 2, false, false, false, [](float &x) { x = x + 5.0; }); - - bool res = true; - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) { - if (C[i][j] != D[i][j]) - res = false; - } - } - std::cout << (res ? 
"passed" : "failed") << std::endl; - return !res; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_query_default.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_query_default.cpp deleted file mode 100644 index 733ac11bdc516..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_query_default.cpp +++ /dev/null @@ -1,174 +0,0 @@ -//==-------- joint_matrix_query_default.cpp - DPC++ joint_matrix-----------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// Needs AMX. -// REQUIRES: gpu, cpu -// REQUIRES: matrix - -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -// RUN: %{run} %t.out - -#include -#include -#include - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -template struct big_matrix { -public: - T *mat; - -public: - T *get_data() { return mat; } - void set_data(T *data) { mat = data; } - big_matrix(T *data) : mat(data) {} -}; - -template -void matrix_multiply(big_matrix &C, - big_matrix &A, - big_matrix &B) { - size_t M = NUM_ROWS_C; - size_t N = NUM_COLS_C; - size_t K = NUM_COLS_A; - assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * 4); - - using myparams2 = matrix_params< - sycl::ext::oneapi::experimental::architecture::intel_cpu_spr, int8_t, - int8_t, int>; - constexpr int TM = myparams2::M; - constexpr int TN = myparams2::N; - constexpr int TK = myparams2::K; - - std::cout << "AMX query sizes are: M " << TM << " N " << TN << " K " << TK - << std::endl; - - constexpr int SG_SZ = TN; - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC(C.get_data(), range<2>(M, N)); - - queue q; - 
q.submit([&](handler &cgh) { - auto accC = bufC.get_access(cgh); - auto accA = bufA.get_access(cgh); - auto accB = bufB.get_access(cgh); - - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [accA, accB, accC, M, N, K](nd_item<2> spmd_item) - [[intel::reqd_sub_group_size(SG_SZ)]] - - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sycl::sub_group sg = spmd_item.get_sub_group(); - - myparams2::joint_matrix_a sub_a; - myparams2::joint_matrix_b sub_b; - myparams2::joint_matrix_c sub_c; - - joint_matrix_load( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, - N, layout::row_major); - for (int k = 0; k < K / TK; k += 1) { - joint_matrix_load( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TM) * K + k * TK, - K); - // Assuming B data is already in VNNI format. 
- joint_matrix_load( - sg, sub_b, - accB.template get_multi_ptr() + - (k * TK / 4) * (N * 4) + sg_starty / SG_SZ * TN * 4, - N * 4); - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - } - joint_matrix_store( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, - N, layout::row_major); - }); // parallel for - }).wait(); -} - -static constexpr size_t MATRIX_M = 128; -static constexpr size_t MATRIX_N = 128; -static constexpr size_t MATRIX_K = 128; -int8_t A[MATRIX_M][MATRIX_K]; -int8_t B[MATRIX_K / 4][MATRIX_N * 4]; -int32_t C[MATRIX_M][MATRIX_N]; -int32_t D[MATRIX_M][MATRIX_N]; - -void matrix_multiply_ref(int32_t *A_mem, int32_t *B_mem, int32_t *C_mem, int M, - int N, int K) { - // tiling - for (int m = 0; m < M; m++) - for (int n = 0; n < N; n++) { - for (int k = 0; k < K; k++) { - char *va = (char *)(A_mem + m * K + k); - char *vb = (char *)(B_mem + k * N + n); - int acc = *(C_mem + m * N + n); - for (int i = 0; i < 4; i++) { - acc += (va[i] * vb[i]); - } - *(C_mem + m * N + n) = acc; - } - } -} - -int main() { - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_K; j++) { - A[i][j] = i + 2 * j; - } - } - for (int i = 0; i < MATRIX_K / 4; i++) { - for (int j = 0; j < MATRIX_N * 4; j++) { - B[i][j] = i + j; - } - } - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) { - C[i][j] = 1; - D[i][j] = 1; - } - } - - big_matrix MC((int32_t *)&C); - big_matrix MD((int32_t *)&D); - big_matrix MA((int8_t *)&A); - big_matrix MB((int8_t *)&B); - matrix_multiply(MC, MA, MB); - matrix_multiply_ref((int32_t *)A, (int32_t *)B, (int32_t *)D, MATRIX_M, - MATRIX_N, MATRIX_K / 4); - - bool res = true; - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) { - if (C[i][j] != D[i][j]) - res = false; - } - } - std::cout << (res ? 
"passed" : "failed") << std::endl; - return !res; -} From 208f2543b65334b8c7448d146f4d65abb555e858 Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Mon, 12 Aug 2024 07:58:57 -0700 Subject: [PATCH 08/19] Revert "wip" This reverts commit 877998948e69f8ff2d6535653cfb1332193e216a. --- sycl/test-e2e/replace.sh | 19 ------------------- sycl/test-e2e/replace2.sh | 15 --------------- 2 files changed, 34 deletions(-) delete mode 100644 sycl/test-e2e/replace.sh delete mode 100644 sycl/test-e2e/replace2.sh diff --git a/sycl/test-e2e/replace.sh b/sycl/test-e2e/replace.sh deleted file mode 100644 index 902efda078010..0000000000000 --- a/sycl/test-e2e/replace.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -for file in *.cpp; do - if grep -q "// RUN: %{build} -o %t.out" "$file"; then - sed -i 's|// RUN: %{build} -o %t.out|// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out|' "$file" - echo "Updated $file" - else - echo "No matching line found in $file" - fi - - if grep -q '#include "' "$file"; then - sed -i -E 's|#include "([^"]+)"|#include "../\1"|' "$file" - echo "Updated all #include lines in $file" - else - echo "No #include lines found in $file" - fi - -done - diff --git a/sycl/test-e2e/replace2.sh b/sycl/test-e2e/replace2.sh deleted file mode 100644 index 0f19f0486f68f..0000000000000 --- a/sycl/test-e2e/replace2.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -find . -name "*.cpp" | while read -r file; do - - line_number=$(grep -n -m 1 "// REQUIRES: .*" "$file" | cut -d: -f1) - - if [ ! 
-z "$line_number" ]; then - sed -i "${line_number}s|// REQUIRES: |// REQUIRES: gpu, |" "$file" - echo "Updated line $line_number in $file" - else - echo "No matching line found in $file" - fi - -done - From 193b0ebba9b485744b03a1c2499fa2f2269a6887 Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Mon, 12 Aug 2024 08:13:59 -0700 Subject: [PATCH 09/19] Disable 3 tests waiting for IGC fix Signed-off-by: Sidorov, Dmitry --- .../Matrix/SPVCooperativeMatrix/element_wise_all_ops_half.cpp | 1 + .../Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8.cpp | 1 + .../SPVCooperativeMatrix/element_wise_all_ops_int8_packed.cpp | 1 + 3 files changed, 3 insertions(+) diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half.cpp index c2f4173890b01..24694a44a8ca9 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half.cpp @@ -5,6 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// UNSUPPORTED: windows, linux // REQUIRES: gpu, aspect-fp16 // REQUIRES: aspect-ext_intel_matrix // REQUIRES: matrix-fp16 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8.cpp index 53826649f7c12..cc18fa57cec4e 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8.cpp @@ -5,6 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// UNSUPPORTED: windows, linux // REQUIRES: gpu, aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed.cpp index f19401cbb0d3f..d57758c88698f 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed.cpp @@ -5,6 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// UNSUPPORTED: windows, linux // REQUIRES: gpu, aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out From 4b354437c9acb23ab1ea087bd0b724c2f77a12e0 Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Mon, 12 Aug 2024 08:15:50 -0700 Subject: [PATCH 10/19] remove .hpp files Signed-off-by: Sidorov, Dmitry --- .../Matrix/SPVCooperativeMatrix/common.hpp | 213 ----------- .../element_wise_abc_impl.hpp | 119 ------- .../element_wise_all_ops_half_impl.hpp | 127 ------- .../element_wise_all_ops_impl.hpp | 248 ------------- .../element_wise_all_ops_int8_impl.hpp | 127 ------- .../element_wise_all_ops_int8_packed_impl.hpp | 130 ------- .../element_wise_all_ops_tf32_impl.hpp | 102 ------ .../element_wise_all_sizes_impl.hpp | 130 ------- .../element_wise_ops_impl.hpp | 150 -------- .../get_coord_float_matC_impl.hpp | 130 ------- .../get_coord_int8_matA_impl.hpp | 176 --------- .../get_coord_int8_matB_impl.hpp | 214 ----------- .../joint_matrix_all_sizes_impl.hpp | 155 -------- .../joint_matrix_annotated_ptr_impl.hpp | 128 ------- .../joint_matrix_apply_bf16_impl.hpp | 113 ------ .../joint_matrix_apply_two_matrices_impl.hpp | 157 -------- .../joint_matrix_bf16_fill_k_cache_impl.hpp | 337 ------------------ .../joint_matrix_bfloat16_array_impl.hpp | 153 -------- ...trix_bfloat16_colmajorA_colmajorB_impl.hpp | 105 ------ .../joint_matrix_bfloat16_impl.hpp | 142 -------- .../joint_matrix_bfloat16_packedB_impl.hpp | 132 
------- .../joint_matrix_colA_rowB_colC_impl.hpp | 99 ----- .../joint_matrix_down_convert_impl.hpp | 82 ----- .../joint_matrix_half_impl.hpp | 131 ------- ...t_matrix_int8_colmajorA_colmajorB_impl.hpp | 114 ------ ...t_matrix_int8_rowmajorA_rowmajorB_impl.hpp | 123 ------- .../joint_matrix_opt_kernel_feature_impl.hpp | 109 ------ .../joint_matrix_out_bounds_impl.hpp | 124 ------- .../joint_matrix_prefetch_impl.hpp | 148 -------- .../joint_matrix_rowmajorA_rowmajorB_impl.hpp | 151 -------- .../joint_matrix_ss_int8_impl.hpp | 128 ------- .../joint_matrix_su_int8_impl.hpp | 132 ------- .../joint_matrix_tf32_impl.hpp | 120 ------- .../joint_matrix_transposeC_impl.hpp | 151 -------- .../joint_matrix_us_int8_impl.hpp | 133 ------- .../joint_matrix_uu_int8_impl.hpp | 133 ------- 36 files changed, 5166 deletions(-) delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/common.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matA_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB_impl.hpp delete mode 100644 
sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_annotated_ptr_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_colA_rowB_colC_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_down_convert_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8_impl.hpp delete mode 100644 
sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8_impl.hpp diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/common.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/common.hpp deleted file mode 100644 index 2b134f8144667..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/common.hpp +++ /dev/null @@ -1,213 +0,0 @@ -//==------------------ common.hpp - DPC++ joint_matrix---------------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include -#include -#include -#include -#include - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; -namespace syclex = sycl::ext::oneapi::experimental; -namespace syclintelex = sycl::ext::intel::experimental; -using bfloat16 = sycl::ext::oneapi::bfloat16; - -// Most of the time, failures related to floating-point calculations (both float -// and bfloat16) are caused by accumulation errors rather than the algorithm -// itself. If it is an algorithm issue, the calculated result gap from the -// reference would be much bigger. To avoid flaky test results while catching -// algorithm errors, we are increasing the accuracy threshold. 
-// Something like this should be good enough to catch algorithm errors: -// fabs(ref[i] - val[i])/max(fabs(ref)) < 10e-2 -constexpr float FLOAT_EPSILON = 10e-2; - -template struct big_matrix { -public: - T *mat; - -public: - T *get_data() { return mat; } - void set_data(T *data) { mat = data; } - big_matrix(T *data) : mat(data) {} -}; - -float make_fp32(bfloat16 x) { - unsigned int y = *((int *)&x); - y = y << 16; - float *res = reinterpret_cast(&y); - return *res; -} - -template -void matrix_multiply_ref(Ta *A, Tb *B, Tc *C, int M, int N, int K, - bool transpose_c = false, bool colmajor_a = false, - bool colmajor_b = false, F &&lambda = {}) { - for (unsigned int m = 0; m < M; m++) { - for (unsigned int n = 0; n < N; n++) { - int c_ind = transpose_c ? (n * M + m) : m * N + n; - Tc acc = *(C + c_ind); - - for (unsigned int k = 0; k < K; k++) { - int a_ind = colmajor_a ? (k * M + m) : m * K + k; - int b_ind = colmajor_b ? (n * K + k) : k * N + n; - Ta *va = (Ta *)(A + a_ind * VF); - Tb *vb = (Tb *)(B + b_ind * VF); - - for (unsigned int i = 0; i < VF; i++) { - if constexpr (std::is_same_v && - std::is_same_v) - acc += make_fp32(va[i]) * make_fp32(vb[i]); - else if constexpr (std::is_same_v && - std::is_same_v || - std::is_integral_v && std::is_integral_v || - (std::is_same_v && - std::is_same_v)) - acc += va[i] * vb[i]; - else if constexpr (std::is_same_v && - std::is_same_v) - acc += (float)va[i] * (float)vb[i]; - else - assert(false && "Unsupported type in matrix_multiply_ref."); - } - } - - if constexpr (!std::is_same_v) { - lambda(acc); - } - *(C + c_ind) = acc; - } - } -} - -template -void matrix_vnni(unsigned int rows, unsigned int cols, T *src, T *dest, - unsigned int vnniFactor = 2) { - for (unsigned int i = 0; i < rows / vnniFactor; i++) { - for (unsigned int j = 0; j < cols; j++) { - for (unsigned int k = 0; k < vnniFactor; k++) { - dest[i * cols * vnniFactor + j * vnniFactor + k] = - src[(i * vnniFactor + k) * cols + j]; - } - } - } -} - -template -void 
matrix_transpose(unsigned int rows, unsigned int cols, T *dst, T *src) { - for (unsigned int i = 0; i < rows; i++) { - for (unsigned int j = 0; j < cols; j++) { - dst[i + j * rows] = src[i * cols + j]; - } - } -} - -template -void matrix_fill(unsigned int rows, unsigned int cols, T *src, T val) { - for (unsigned int i = 0; i < rows; i++) { - for (unsigned int j = 0; j < cols; j++) { - src[i * cols + j] = val; - } - } -} - -template -void matrix_fill(unsigned int rows, unsigned int cols, T *src, F op) { - for (unsigned int i = 0; i < rows; i++) { - for (unsigned int j = 0; j < cols; j++) { - src[i * cols + j] = T(op(i, j)); - } - } -} - -template -void matrix_rand(unsigned int rows, unsigned int cols, T *src, T val) { - std::random_device dev; - std::uniform_real_distribution fdistr(-val, val); - std::uniform_int_distribution idistr((int)-val, (int)val); - - for (unsigned int i = 0; i < rows; i++) { - for (unsigned int j = 0; j < cols; j++) { - if constexpr (std::is_same_v || std::is_same_v || - std::is_same_v) { - src[i * cols + j] = T(fdistr(dev)); - } else if constexpr (std::is_integral_v) { - src[i * cols + j] = T(idistr(dev)); - } else { - assert(false && "Unsupported type in matrix_rand."); - } - } - } -} - -template -void matrix_copy(unsigned int rows, unsigned int cols, T *src, T *dst) { - for (unsigned int i = 0; i < rows; i++) { - for (unsigned int j = 0; j < cols; j++) { - dst[i * cols + j] = src[i * cols + j]; - } - } -} - -template -bool matrix_compare(unsigned int rows, unsigned int cols, T1 *src, T2 *ref) { - for (int i = 0; i < rows; i++) { - for (int j = 0; j < cols; j++) { - if constexpr (!exact && (std::is_same_v || - std::is_same_v || - (std::is_same_v && - std::is_same_v))) { - float diff = std::fabs(src[i * cols + j] - (T1)ref[i * cols + j]); - if (diff > FLOAT_EPSILON || std::isnan(src[i * cols + j])) { - std::cout << "Incorrect result in matrix. 
" - << "i: " << i << ", j: " << j - << ", Ref: " << (T1)ref[i * cols + j] - << ", Val: " << src[i * cols + j] << ", Diff: " << diff - << ", Epsilon: " << FLOAT_EPSILON << "\n"; - return false; - } - } else if constexpr (exact || std::is_same_v) { - if (src[i * cols + j] != ref[i * cols + j]) { - std::cout << "Incorrect result in matrix." - << "i: " << i << ", j: " << j - << ", Ref: " << ref[i * cols + j] - << ", Val: " << src[i * cols + j] << "\n"; - return false; - } - } else { - std::cout << "Unsupported type in matrix_compare\n"; - return false; - } - } - } - return true; -} - -bool is_type_supported_by_device(queue q, matrix_type type) { - std::vector combinations = - q.get_device() - .get_info(); - for (int i = 0; i < combinations.size(); i++) { - if (combinations[i].atype == type) { - return true; - } - } - return false; -} - -template size_t get_sg_size(queue q) { - auto KernelID = get_kernel_id(); - auto KB = - get_kernel_bundle(q.get_context(), {KernelID}); - auto kernel = KB.get_kernel(KernelID); - - return kernel - .template get_info( - q.get_device()); -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc_impl.hpp deleted file mode 100644 index 5caf6d3e0a3e7..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc_impl.hpp +++ /dev/null @@ -1,119 +0,0 @@ -//==----------- element_wise_abc_impl.hpp - DPC++ joint_matrix-------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include -#include -#include - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -template class add; - -template -void matrix_elem_wise_ops(big_matrix &C, big_matrix &A, - big_matrix &B) { - size_t NDRangeM = 1; - size_t NDRangeN = 1; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC(C.get_data(), range<2>(M, N)); - - queue q; - size_t sg_size = get_sg_size>(q); - q.submit([&](handler &cgh) { - accessor accC{bufC, cgh}; - accessor accA{bufA, cgh}; - accessor accB{bufB, cgh}; - - cgh.parallel_for>( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a; - // For B, we assume B has been already VNNIed. 
- joint_matrix - sub_b; - joint_matrix sub_c; - - joint_matrix_load( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * M) * K, - K); - joint_matrix_apply(sg, sub_a, [](T2 &x) { x += 1; }); - - joint_matrix_load( - sg, sub_b, - accB.template get_multi_ptr() + - sg_starty / sg_size * N * vnniFactor, - N * vnniFactor); - joint_matrix_apply(sg, sub_b, [](T2 &x) { x += 1; }); - - joint_matrix_load( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * M) * N + sg_starty / sg_size * N, - N, layout::row_major); - joint_matrix_apply(sg, sub_c, [](T1 &x) { x += 1; }); - }); // parallel for - }).wait(); -} - -template -void test() { - Tc A[TM][TK]; - Tc B[TK / VF][TN * VF]; - Ta C[TM][TN]; - - big_matrix MC((Ta *)&C); - big_matrix MA((Tc *)&A); - big_matrix MB((Tc *)&B); - - return matrix_elem_wise_ops(MC, MA, MB); -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test(); - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test(); - break; - } - } - - return 0; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half_impl.hpp deleted file mode 100644 index 4065c7a78a566..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half_impl.hpp +++ /dev/null @@ -1,127 +0,0 @@ -//===---element_wise_all_ops_half_impl.hpp - DPC++ joint_matrix------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -template class add; -template class sub; -template class mul; -template class divide; -template class logic; - -template -void assert_ops_ref(host_accessor C, const R ref) { - for (size_t i = 0; i < Rows; i++) - for (size_t j = 0; j < Cols; j++) { - auto diff = C[i][j] - ref; - assert(std::fabs(static_cast(diff)) < - std::numeric_limits::epsilon()); - } -} - -template -void matrix_verify_op(big_matrix &A, const R ref, OP op) { - buffer bufA(A.get_data(), range<2>(Rows, Cols)); - - queue q; - size_t sg_size = get_sg_size(q); - nd_range<2> r({Rows / TileRows, Cols / TileCols * sg_size}, {1, 1 * sg_size}); - - q.submit([&](handler &cgh) { - sycl::accessor accA{bufA, cgh, sycl::read_write}; - - cgh.parallel_for( - r, [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_a; - - joint_matrix_fill(sg, sub_a, 5); - - joint_matrix_apply(sg, sub_a, op); - ext::intel::experimental::matrix::joint_matrix_store( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TileRows) * Cols + - sg_starty / sg_size * TileCols, - Cols); - }); // parallel for - }).wait(); - assert_ops_ref(bufA.get_host_access(read_only), ref); -} - -template void test() { - static constexpr size_t Rows = TM * 2; - static constexpr size_t Cols = TK * 2; - Ta A[Rows][Cols]; - - big_matrix MA((Ta *)&A); - - matrix_verify_op, TResult>( - MA, 7, [=](Ta &x) { x = x + static_cast(2); }); - matrix_verify_op, TResult>( - MA, 3, [=](Ta &x) { x = x - static_cast(2); }); - matrix_verify_op, TResult>( - MA, 10, [=](Ta &x) { x = x * 
static_cast(2); }); - matrix_verify_op, TResult>( - MA, 2.5, [=](Ta &x) { x = x / static_cast(2); }); - matrix_verify_op, TResult>( - MA, 7, [=](Ta &x) { - if (x) { - if (x > static_cast(2) || x >= static_cast(2) || - x < static_cast(2) || x <= static_cast(2)) { - Ta val = (x != static_cast(2)) ? x : static_cast(2); - val--; - val++; - if (x == static_cast(2)) { - val -= static_cast(2); - val *= static_cast(3); - val /= static_cast(2); - } else { - val += static_cast(2); - } - x = val; - } - } - }); -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test(); - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test(); - break; - } - } - - return 0; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_impl.hpp deleted file mode 100644 index db686d23ae05d..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_impl.hpp +++ /dev/null @@ -1,248 +0,0 @@ -//==----------- element_wise_all_ops_impl.hpp - DPC++ joint_matrix---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -template -void assert_ops_ref(host_accessor mat, - const float ref) { - for (size_t i = 0; i < NUM_ROWS; i++) - for (size_t j = 0; j < NUM_COLS; j++) { - float diff; - if constexpr (std::is_same_v) - diff = make_fp32(mat[i][j]) - ref; - else - diff = mat[i][j] - ref; - assert(std::fabs(static_cast(diff)) < - std::numeric_limits::epsilon()); - } -} - -template -void verify_op_ab(const T l, const T r, const float ref, OP op) { - T mat[NUM_ROWS / VF][NUM_COLS * VF]; - big_matrix big_mat((T *)&mat); - - buffer bufMat(big_mat.get_data(), - range<2>(NUM_ROWS / VF, NUM_COLS * VF)); - - queue q; - size_t sg_size = get_sg_size(q); - q.submit([&](handler &cgh) { - sycl::accessor accessMat{bufMat, cgh, sycl::read_write}; - cgh.parallel_for( - nd_range<2>({NUM_ROWS / SUB_ROWS, NUM_COLS / SUB_COLS * sg_size}, - {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_mat; - joint_matrix_fill(sg, sub_mat, l); - joint_matrix_apply(sg, sub_mat, [=](T &x) { x = op(x, r); }); - ext::intel::experimental::matrix::joint_matrix_store( - sg, sub_mat, - accessMat.template get_multi_ptr() + - (sg_startx * SUB_ROWS / VF) * NUM_COLS * VF + - sg_starty / sg_size * SUB_COLS * VF, - NUM_COLS * VF); - }); // parallel for - }).wait(); - assert_ops_ref( - bufMat.get_host_access(read_only), ref); -} - -template -void verify_op_c(const T l, const T r, const float ref, OP op) { - T mat[NUM_ROWS][NUM_COLS]; - big_matrix big_mat((T *)&mat); - - buffer bufMat(big_mat.get_data(), range<2>(NUM_ROWS, 
NUM_COLS)); - queue q; - size_t sg_size = get_sg_size(q); - q.submit([&](handler &cgh) { - sycl::accessor accessMat{bufMat, cgh, sycl::read_write}; - cgh.parallel_for( - nd_range<2>({NUM_ROWS / SUB_ROWS, NUM_COLS / SUB_COLS * sg_size}, - {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_mat; - joint_matrix_fill(sg, sub_mat, l); - joint_matrix_apply(sg, sub_mat, [=](T &x) { x = op(x, r); }); - - joint_matrix_store( - sg, sub_mat, - accessMat.template get_multi_ptr() + - (sg_startx * SUB_ROWS) * NUM_COLS + - sg_starty / sg_size * SUB_COLS, - NUM_COLS, layout::row_major); - }); // parallel for - }).wait(); - assert_ops_ref(bufMat.get_host_access(read_only), ref); -} - -// Avoid same kernel name for different types -template -class ewops_ab {}; -template -void test_ewops_ab() { - if constexpr (Use == use::a) - std::cout << "Test A "; - else - std::cout << "Test B "; - std::cout << SROWS << "x" << SCOLS << "\n"; - - static constexpr size_t NROWS = SROWS * 2; - static constexpr size_t NCOLS = SCOLS * 2; - - verify_op_ab>( - T(5.0), T(2.0), 7.0, [](auto l, auto r) { return l + r; }); - verify_op_ab>( - T(5.0), T(2.0), 3.0, [](auto l, auto r) { return l - r; }); - verify_op_ab>( - T(5.0), T(2.0), 10.0, [](auto l, auto r) { return l * r; }); - verify_op_ab>( - T(5.0), T(2.0), 2.5, [](auto l, auto r) { return l / r; }); - verify_op_ab>( - T(5.0), T(5.0), 5.0, [](auto l, auto r) { return l == r ? l : T(1.0); }); - verify_op_ab>( - T(5.0), T(4.0), 4.0, [](auto l, auto r) { return l == r ? l : r; }); - verify_op_ab>( - T(5.0), T(5.0), 1.0, [](auto l, auto r) { return l != r ? 
l : T(1.0); }); - verify_op_ab>( - T(5.0), T(2.0), 3.0, - [](auto l, auto r) { return l > r ? T(3.0) : T(2.0); }); - verify_op_ab>( - T(5.0), T(2.0), 2.0, - [](auto l, auto r) { return l < r ? T(3.0) : T(2.0); }); - verify_op_ab>( - T(5.0), T(2.0), 3.0, - [](auto l, auto r) { return l >= r ? T(3.0) : T(2.0); }); - verify_op_ab>( - T(5.0), T(2.0), 2.0, - [](auto l, auto r) { return l <= r ? T(3.0) : T(2.0); }); -} - -// Avoid same kernel name for different types and numbers of columns -template class ewops_c {}; -template void test_ewops_c() { - std::cout << "Test C " << SROWS << "x" << SCOLS << "\n"; - - static constexpr size_t NROWS = SROWS * 2; - static constexpr size_t NCOLS = SCOLS * 2; - - verify_op_c>( - T(5.0), T(2.0), 7.0, [](auto l, auto r) { return l + r; }); - verify_op_c>( - T(5.0), T(2.0), 3.0, [](auto l, auto r) { return l - r; }); - verify_op_c>( - T(5.0), T(2.0), 10.0, [](auto l, auto r) { return l * r; }); - verify_op_c>( - T(5.0), T(2.0), 2.5, [](auto l, auto r) { return l / r; }); - verify_op_c>( - T(5.0), T(5.0), 5.0, [](auto l, auto r) { return l == r ? l : T(1.0); }); - verify_op_c>( - T(5.0), T(4.0), 4.0, [](auto l, auto r) { return l == r ? l : r; }); - verify_op_c>( - T(5.0), T(5.0), 1.0, [](auto l, auto r) { return l != r ? l : T(1.0); }); - verify_op_c>( - T(5.0), T(2.0), 3.0, - [](auto l, auto r) { return l > r ? T(3.0) : T(2.0); }); - verify_op_c>( - T(5.0), T(2.0), 2.0, - [](auto l, auto r) { return l < r ? T(3.0) : T(2.0); }); - verify_op_c>( - T(5.0), T(2.0), 3.0, - [](auto l, auto r) { return l >= r ? T(3.0) : T(2.0); }); - verify_op_c>( - T(5.0), T(2.0), 2.0, - [](auto l, auto r) { return l <= r ? 
T(3.0) : T(2.0); }); -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - for (auto &combination : combinations) { - if (combination.nsize == 0 || - combination.nsize == 16) { // Intel AMX or architecture::intel_gpu_pvc - test_ewops_ab(); - test_ewops_ab(); - test_ewops_ab(); - test_ewops_c(); - test_ewops_c(); - - if (combination.nsize == 16) { // architecture::intel_gpu_pvc - test_ewops_ab(); - test_ewops_c(); -// This combination is not currently supported for sub group size = 32 in IGC -#if (!defined(SG_SZ) || SG_SZ != 32) - test_ewops_ab(); - test_ewops_ab(); - test_ewops_c(); - test_ewops_c(); -#endif - } - break; - } - - if (combination.nsize == 8) { // architecture::intel_gpu_dg2* - test_ewops_ab(); - test_ewops_ab(); - test_ewops_c(); - break; - } - } - - return 0; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_impl.hpp deleted file mode 100644 index 8a2f1f495e41d..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_impl.hpp +++ /dev/null @@ -1,127 +0,0 @@ -//===---element_wise_all_ops_int8_impl.hpp - DPC++ joint_matrix------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -template class add; -template class sub; -template class mul; -template class divide; -template class logic; - -template -void assert_ops_ref(host_accessor C, const R ref) { - for (size_t i = 0; i < Rows; i++) - for (size_t j = 0; j < Cols; j++) { - auto diff = C[i][j] - ref; - assert(std::fabs(static_cast(diff)) <= - std::numeric_limits::epsilon()); - } -} - -template -void matrix_verify_op(big_matrix &A, const R ref, OP op) { - buffer bufA(A.get_data(), range<2>(Rows, Cols)); - - queue q; - size_t sg_size = get_sg_size(q); - nd_range<2> r({Rows / TileRows, Cols / TileCols * sg_size}, {1, 1 * sg_size}); - - q.submit([&](handler &cgh) { - sycl::accessor accA{bufA, cgh, sycl::read_write}; - - cgh.parallel_for( - r, [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_a; - - joint_matrix_fill(sg, sub_a, 5); - - joint_matrix_apply(sg, sub_a, op); - ext::intel::experimental::matrix::joint_matrix_store( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TileRows) * Cols + - sg_starty / sg_size * TileCols, - Cols); - }); // parallel for - }).wait(); - assert_ops_ref(bufA.get_host_access(read_only), ref); -} - -template void test() { - static constexpr size_t Rows = TM * 2; - static constexpr size_t Cols = TK * 2; - Ta A[Rows][Cols]; - - big_matrix MA((Ta *)&A); - - matrix_verify_op, TResult>( - MA, 7, [=](auto &x) { x = x + 2; }); - matrix_verify_op, TResult>( - MA, 3, [=](auto &x) { x = x - 2; }); - matrix_verify_op, TResult>( - MA, 10, [=](auto &x) { x = x * 2; }); - 
matrix_verify_op, TResult>( - MA, 2, [=](auto &x) { x = x / 2; }); // truncation is expected - matrix_verify_op, TResult>( - MA, 7, [=](auto &x) { - if (x) { - if (x > 2 || x >= 2 || x < 2 || x <= 2) { - Ta val = (x != 2) ? x : 2; - val--; - val++; - if (x == 2) { - val -= 2; - val *= 3; - val /= 2; - } else { - val += 2; - } - x = val; - } - } - }); -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test(); - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test(); - break; - } - } - - return 0; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed_impl.hpp deleted file mode 100644 index 7336bb8467fa5..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed_impl.hpp +++ /dev/null @@ -1,130 +0,0 @@ -//===---element_wise_all_ops_int8_packed_impl.hpp - DPC++ joint_matrix-----===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -template class add; -template class sub; -template class mul; -template class divide; -template class logic; - -template -void assert_ops_ref(host_accessor C, - const TResult ref) { - for (size_t i = 0; i < Rows; i++) - for (size_t j = 0; j < Cols; j++) { - TResult diff = C[i][j] - ref; - assert(std::fabs(static_cast(diff)) <= - std::numeric_limits::epsilon()); - } -} - -template -void matrix_verify_op(big_matrix &B, const TResult ref, OP op) { - buffer bufB(B.get_data(), range<2>(Rows, Cols)); - - queue q; - size_t sg_size = get_sg_size(q); - nd_range<2> r({Rows / TileRows, Cols / TileCols * sg_size}, {1, 1 * sg_size}); - - q.submit([&](handler &cgh) { - sycl::accessor accB{bufB, cgh, sycl::read_write}; - - cgh.parallel_for( - r, [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_b; - - joint_matrix_fill(sg, sub_b, 5); - - joint_matrix_apply(sg, sub_b, op); - ext::intel::experimental::matrix::joint_matrix_store( - sg, sub_b, - accB.template get_multi_ptr() + - (sg_startx * TileRows / VNNI) * Cols * VNNI + - sg_starty / sg_size * TileCols * VNNI, - Cols * VNNI); - }); // parallel for - }).wait(); - assert_ops_ref(bufB.get_host_access(read_only), ref); -} - -template -void test() { - static constexpr size_t Rows = TK * 2; - static constexpr size_t Cols = TN * 2; - T B[Rows][Cols]; - - big_matrix MB((T *)&B); - - matrix_verify_op, TResult>( - MB, 7, [=](auto &x) { x = x + 2; }); - matrix_verify_op, TResult>( - MB, 3, [=](auto &x) { x = x - 2; }); - matrix_verify_op, TResult>( - MB, 
10, [=](auto &x) { x = x * 2; }); - matrix_verify_op, TResult>( - MB, 2, [=](auto &x) { x = x / 2; }); // truncation is expected - matrix_verify_op, TResult>( - MB, 7, [=](auto &x) { - if (x) { - if (x > 2 || x >= 2 || x < 2 || x <= 2) { - T val = (x != 2) ? x : 2; - val--; - val++; - if (x == 2) { - val -= 2; - val *= 3; - val /= 2; - } else { - val += 2; - } - x = val; - } - } - }); -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test(); - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test(); - break; - } - } - - return 0; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32_impl.hpp deleted file mode 100644 index ba63c45deea2d..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32_impl.hpp +++ /dev/null @@ -1,102 +0,0 @@ -//===---element_wise_all_ops_tf32_impl.hpp - DPC++ joint_matrix------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#define TM 8 -#define TK 8 - -template -void assert_ops_ref(host_accessor C, - const float ref) { - for (size_t i = 0; i < M; i++) - for (size_t j = 0; j < N; j++) { - auto diff = C[i][j] - ref; - assert(std::fabs(static_cast(diff)) < - std::numeric_limits::epsilon()); - } -} - -template -void matrix_verify_op(big_matrix &A, const float ref, OP op) { - buffer bufA(A.get_data(), range<2>(M, K)); - - queue q; - size_t sg_size = get_sg_size(q); - nd_range<2> r({M / TileM, K / TileK * sg_size}, {1, 1 * sg_size}); - - q.submit([&](handler &cgh) { - sycl::accessor accA{bufA, cgh, sycl::read_write}; - - cgh.parallel_for( - r, [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_a; - joint_matrix_fill(sg, sub_a, round_to_tf32(5.0)); - - joint_matrix_apply(sg, sub_a, op); - ext::intel::experimental::matrix::joint_matrix_store( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TileM) * K + sg_starty / sg_size * TileK, - K); - }); // parallel for - }).wait(); - assert_ops_ref(bufA.get_host_access(sycl::read_only), ref); -} - -static constexpr size_t MATRIX_M = TM * 2; -static constexpr size_t MATRIX_K = TK * 2; -float A[MATRIX_M][MATRIX_K]; - -int main() { - - big_matrix MA((float *)&A); - - matrix_verify_op(MA, 7.0, - [=](auto &x) { x = x + round_to_tf32(2); }); - matrix_verify_op(MA, 3.0, - [=](auto &x) { x = x - round_to_tf32(2); }); - matrix_verify_op(MA, 10.0, - [=](auto &x) { x = x * round_to_tf32(2); }); - matrix_verify_op(MA, 2.5, - [=](auto &x) { x = x / round_to_tf32(2); 
}); - matrix_verify_op(MA, 7.0, [=](auto &x) { - if (x) { - if (x > 2 || x >= 2 || x < 2 || x <= 2) { - float val = (x != 2) ? x : 2; - val--; - val++; - if (x == 2) { - val -= 2; - val *= 3; - val /= 2; - } else { - val += 2; - } - x = val; - } - } - }); - - return 0; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes_impl.hpp deleted file mode 100644 index 5228a154e9f6f..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes_impl.hpp +++ /dev/null @@ -1,130 +0,0 @@ -//===---element_wise_all_ops_all_sizes_impl.hpp - DPC++ joint_matrix-------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -static constexpr size_t M_MULTIPLIER = 16; -template class add; - -template -void assert_ops_ref(host_accessor C, - const T ref) { - for (size_t i = 0; i < M; i++) - for (size_t j = 0; j < N; j++) { - if (std::is_same_v) { - auto diff = make_fp32(C[i][j]) - make_fp32(ref); - assert(std::fabs(static_cast(diff)) < - std::numeric_limits::epsilon()); - } else if (std::is_same_v) { - assert(C[i][j] == ref); - } - } -} - -template -void matrix_verify_add(const T1 val1, const T1 val2, const T1 result) { - static constexpr size_t M = TM * M_MULTIPLIER; - static constexpr size_t K = 128; - T MatA[M][K]; - - size_t NDRangeM = M / TM; - size_t NDRangeK = K / TK; - queue q; - size_t sg_size = get_sg_size(q); - nd_range<2> r({NDRangeM, NDRangeK * sg_size}, {1, 1 * sg_size}); - big_matrix A((T *)&MatA); - - buffer bufA(A.get_data(), range<2>(M, K)); - - q.submit([&](handler &cgh) { - sycl::accessor accA{bufA, cgh, sycl::read_write}; - - cgh.parallel_for( - r, [=](nd_item<2> spmd_item) -#ifdef SG_SZ - 
[[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a; - - joint_matrix_fill(sg, sub_a, val1); - - joint_matrix_apply(sg, sub_a, [=](T &x) { x += val2; }); - - ext::intel::experimental::matrix::joint_matrix_store( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TM) * K + sg_starty / sg_size * TK, - K); - }); // parallel for - }).wait(); - assert_ops_ref(bufA.get_host_access(), result); -} - -template -void add_ref() { - if constexpr (std::is_same_v) { - // Tests whether 5 + 2 = 7 operation is successful. - matrix_verify_add( - bfloat16(5.0), bfloat16(2.0), bfloat16(7.0)); - } - if constexpr (std::is_same_v) { - matrix_verify_add( - 5 /*val1*/, 2 /*val2*/, 7 /*result*/); - } -} - -template void test() { - add_ref>(); - add_ref>(); - add_ref>(); - add_ref>(); - add_ref>(); - add_ref>(); - add_ref>(); - - add_ref>(); - add_ref>(); - add_ref>(); - add_ref>(); - add_ref>(); - add_ref>(); - add_ref>(); -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test<16>(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test<16>(); - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test<8>(); - break; - } - } - - std::cout << "Passed\n"; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops_impl.hpp deleted file mode 100644 index 8ffbbf8eabff1..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops_impl.hpp +++ /dev/null @@ -1,150 +0,0 @@ 
-//==----------- element_wise_ops_impl.hpp - DPC++ joint_matrix---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -template -void matrix_multiply(big_matrix &C, big_matrix &A, - big_matrix &B) { - // stride should be X's cols, e.g., B's stirde = N*4 - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC(C.get_data(), range<2>(M, N)); - - queue q; - size_t sg_size = get_sg_size(q); - q.submit([&](handler &cgh) { - auto accC = bufC.template get_access(cgh); - auto accA = bufA.template get_access(cgh); - auto accB = bufB.template get_access(cgh); - - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sycl::sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a; - // For B, we assume B has been already VNNIed. 
- joint_matrix - sub_b; - joint_matrix sub_c; - - joint_matrix_load( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - for (int k = 0; k < K / TK; k += 1) { - joint_matrix_load( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TM) * K + k * TK, - K); - joint_matrix_load( - sg, sub_b, - accB.template get_multi_ptr() + - (k * TK / VF) * (N * VF) + sg_starty / sg_size * TN * VF, - N * VF); - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - } - joint_matrix_apply(sg, sub_c, [](Tc &x) { x = x * 2; }); - joint_matrix_store( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - }); // parallel for - }).wait(); -} - -template -bool test() { - - static constexpr size_t MATRIX_M = TM * 2; - static constexpr size_t MATRIX_N = TN * 2; - static constexpr size_t MATRIX_K = TK * 2; - - Ta A[MATRIX_M][MATRIX_K]; - Ta B[MATRIX_K / VF][MATRIX_N * VF]; - Tc C[MATRIX_M][MATRIX_N]; - Tc D[MATRIX_M][MATRIX_N]; - - matrix_rand(MATRIX_M, MATRIX_K, (Ta *)A, (Ta)100); - matrix_rand(MATRIX_K / VF, MATRIX_N * VF, (Ta *)B, (Ta)100); - matrix_fill(MATRIX_M, MATRIX_N, (Tc *)C, (Tc)1); - matrix_fill(MATRIX_M, MATRIX_N, (Tc *)D, (Tc)1); - - big_matrix MC((Tc *)&C); - big_matrix MD((Tc *)&D); - big_matrix MA((Ta *)&A); - big_matrix MB((Ta *)&B); - - matrix_multiply(MC, MA, MB); - matrix_multiply_ref((Ta *)A, (Ta *)B, (Tc *)D, MATRIX_M, - MATRIX_N, MATRIX_K / VF, false, false, - false, [](Tc &x) { x = x * 2; }); - bool res = matrix_compare(MATRIX_M, MATRIX_N, (Tc *)C, (Tc *)D); - - std::cout << TM << "x" << TN << "x" << TK << ": " - << (res ? 
"passed" : "failed") << std::endl; - return res; -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - bool passed = true; - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - passed &= test(); - passed &= test(); - passed &= test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - passed &= test(); - passed &= test(); - passed &= test(); -#if (!defined(SG_SZ) || SG_SZ != 32) - // These combination are not currently supported for subgroup size = 32 in - // IGC - passed &= test(); - passed &= test(); - passed &= test(); -#endif - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - passed &= test(); - passed &= test(); - passed &= test(); - break; - } - } - - return !passed; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC_impl.hpp deleted file mode 100644 index bb0b6336a0ec5..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC_impl.hpp +++ /dev/null @@ -1,130 +0,0 @@ -//==----------- get_coord_float_matC_impl.hpp - DPC++ joint_matrix---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include -#include - -template class add_rows; - -// clang-format off -/* -Here's how the data is distributed for sub group size = 16 on PVC -W0 --> 0 1 2 3 4 5 6 7 -wi [0,0] -> i=0, [0, 0] wi [0,1] --> i=0, [0, 1] wi [0,15] --> i=0, [0, 15] - i=1, [1, 0] i=1, [1, 1] i=1, [1, 15] - i=2, [2, 0] i=2, [2, 1] ... - ... .... 
- i=7, [7, 0] i=7, [7, 1] -*/ -// clang-format on - -template -void matrix_sum_rows(big_matrix &C, T *sum_rows) { - buffer bufC((T *)C.get_data(), range<2>(Rows, Cols)); - buffer sum_rows_v(sum_rows, Rows); - - queue q; - size_t sg_size = get_sg_size>(q); - q.submit([&](handler &cgh) { - sycl::accessor accC{bufC, cgh, sycl::read_write}; - sycl::accessor v{sum_rows_v, cgh, sycl::read_write}; - - cgh.parallel_for>( - nd_range<2>({Rows / TileRows, Cols / TileCols * sg_size}, - {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_c; - - joint_matrix_load( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TileRows) * Cols + - sg_starty / sg_size * TileCols, - Cols, layout::row_major); - - T sum_local_rows[Rows] = {0}; - - ext::intel::experimental::matrix::joint_matrix_apply( - sg, sub_c, [&](T &x, size_t row, size_t col) { - sum_local_rows[row + global_idx * TileRows] += x; - }); - for (int i = 0; i < Rows; i++) { - sum_local_rows[i] = - reduce_over_group(sg, sum_local_rows[i], sycl::plus<>()); - // only Groups leader perform the global reduction - if (global_idy % sg_size == 0) { - sycl::atomic_ref - aref(v[i]); - aref.fetch_add(sum_local_rows[i]); - } - } - }); // parallel for - }).wait(); -} - -template void test() { - constexpr size_t SCALE = 2; - static constexpr size_t Rows = TM * SCALE; - static constexpr size_t Cols = TN * SCALE; - - T sum_rows[Rows] = {0}; - T sum_rows_ref[Rows] = {0}; - T C[Rows][Cols]; - 
big_matrix MC((T *)&C); - - matrix_rand(Rows, Cols, (T *)&C, (T)100); - matrix_sum_rows(MC, sum_rows); - - for (int i = 0; i < Rows; i++) { - for (int j = 0; j < Cols; j++) { - sum_rows_ref[i] += C[i][j]; - } - assert(std::fabs(sum_rows_ref[i] - sum_rows[i]) <= FLOAT_EPSILON); - } -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test(); - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test(); - break; - } - } - return 0; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matA_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matA_impl.hpp deleted file mode 100644 index f3d89cc717ef4..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matA_impl.hpp +++ /dev/null @@ -1,176 +0,0 @@ -//==----------- get_coord_int8_matA_impl.hpp - DPC++ joint_matrix---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include -#include - -template class add_rows; - -template -void sum_rows_ref( - host_accessor A, - host_accessor sum_rows) { - int sum_rows_ref[Rows] = {0}; - for (size_t i = 0; i < Rows; i++) { - for (size_t j = 0; j < Cols; j++) { - sum_rows_ref[i] += A[i][j]; - } - auto diff = sum_rows[i] - sum_rows_ref[i]; - assert(std::fabs(static_cast(diff)) <= - std::numeric_limits::epsilon()); - } -} - -// clang-format off -/* For sub group size = 16: -wi [0,0] -> i=0, [0, 0] wi [0,1] --> i=0, [0, 2] wi [0,15] --> i=0, [0, 30] - i=1, [0, 1] i=1, [0, 3] i=1, [0, 31] - i=2, [1, 0] i=2, [1, 2] i=2, [1, 30] - i=3, [1, 1] i=3, [1, 3] i=3, [1, 31] - i=4, [2, 0] i=4, [2, 2] ... - i=5, [2, 1] i=5, [2, 3] - ... .... - i=14,[7, 0] i=14, [7, 2] - i=15,[7, 1] i=15, [7, 3] i=15, [7, 31] - -Here's how the distribution of the A matrix looks like for this test case -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -<--------------------------------- SG1 ---------------------------------> -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x 
x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -<0> <1> <2> <3> <4> <5> <6> <7> ..... WORK ITEMS -Each work item has 16 elements <8 rows and 2 cols of the original matrix> -the data_slice holds the matrix elements in the following order: -0 0 0 0 - / - / -1 1 1 1 - / - / -2 2 2 2 - / - / -3 3 3 3 -W0 --> 0 0 1 1 2 2 3 3 .... 7 7 -*/ -// clang-format on - -template -void matrix_sum_rows(big_matrix &A) { - buffer bufA(A.get_data(), range<2>(Rows, Cols)); - - // size of vector is equal to number of rows in big matrix - TResult sum_rows[Rows] = {0}; - buffer sum_rows_v(sum_rows, Rows); - queue q; - size_t sg_size = get_sg_size>(q); - nd_range<2> r({Rows / TileRows, Cols / TileCols * sg_size}, {1, 1 * sg_size}); - q.submit([&](handler &cgh) { - sycl::accessor accA{bufA, cgh, sycl::read_write}; - sycl::accessor v{sum_rows_v, cgh, sycl::read_write}; - - cgh.parallel_for>( - r, [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sycl::sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_a; - joint_matrix_load( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TileRows * Cols) + - sg_starty / sg_size * TileCols, - Cols); - - TResult sum_local_rows[Rows] = {0}; - - ext::intel::experimental::matrix::joint_matrix_apply( - sg, sub_a, [&](T &x, size_t row, size_t col) { - sum_local_rows[row + global_idx * TileRows] += x; - }); - for (int i = 0; i < Rows; ++i) { - sum_local_rows[i] = - reduce_over_group(sg, sum_local_rows[i], sycl::plus<>()); - - // only Groups leader performs the global reduction - if (global_idy % sg_size == 0) { - sycl::atomic_ref - aref(v[i]); - aref.fetch_add(sum_local_rows[i]); - } - } - 
}); // parallel for - }).wait(); - sum_rows_ref(bufA.get_host_access(), - sum_rows_v.get_host_access()); -} - -template void test() { - static constexpr size_t Rows = TM * 2; - static constexpr size_t Cols = TK * 2; - T A[Rows][Cols]; - - big_matrix MA((T *)&A); - - for (int i = 0; i < Rows; i++) { - for (int j = 0; j < Cols; j++) { - A[i][j] = i + j; - } - } - - matrix_sum_rows(MA); -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test(); - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test(); - break; - } - } - return 0; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB_impl.hpp deleted file mode 100644 index 8b63dadc029b3..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB_impl.hpp +++ /dev/null @@ -1,214 +0,0 @@ -//==----------- get_coord_int8_matB_impl.hpp - DPC++ joint_matrix---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include -#include - -template class add_cols; - -template -void sum_cols_ref( - host_accessor B, - host_accessor sum_cols) { - TResult sum_cols_ref[Cols] = {0}; - for (size_t j = 0; j < Cols; j++) { - for (size_t i = 0; i < Rows; i++) { - sum_cols_ref[j] += B[i][j]; - } - auto diff = sum_cols[j] - sum_cols_ref[j]; - assert(std::fabs(static_cast(diff)) <= - std::numeric_limits::epsilon()); - } -} - -// clang-format off -/* - Here is a demonstration of how matrix B will be divided across - work items for this test case for sub group size = 16 on PVC. - < --------------- 128 ----------------------------------> - x x x x x x x x x x x x x x x x .......... x x x x x x ^ - x x x x x x x x x x x x x x x x .......... x x x x x x 16 - x x x x x x x x x x x x x x x x .......... x x x x x x | - ..... | - x x x x x x x x x x x x x x x x .......... x x x x x x | - x x x x x x x x x x x x x x x x .......... x x x x x x v - - --------------- 64 ----------------> - x x x x x x .......... x x x x x x ^ - x x x x x x .......... x x x x x x 8 - x x x x x x .......... x x x x x x | <-- part of (VNNI-ed) - ..... | original matrix each SG - x x x x x x .......... x x x x x x | holds - x x x x x x .......... x x x x x x v - < WI0 > < WI15 > - <-------- 16 -------------> - x x x .......... x x x ^ - x x x .......... x x x | - x x x .......... x x x | <-- part of (non-VNNI-ed) original matrix - ..... | each SG holds - x x x .......... x x x | - x x x .......... x x x | - x x x .......... x x x 32 - x x x .......... x x x | - x x x .......... x x x | - x x x .......... x x x | - x x x .......... x x x | - x x x .......... x x x | - x x x .......... x x x v - If we divide the above matrix across 16 (SG_SZ) work items, - each WI will hold 32 elements. And these 32 elements will be - 8x4 chunks as shown in the VNNI-ed matrix figure. 
- -The total distribution among the WIs in ALL the sub-groups is as follows: -This is useful to figure out how the global index is to be calculated - -W0 --> 0 0 0 0 1 1 1 1 ... 7 7 7 7 --> total 32 elements -wi [0,0] --> i=0, [0, 0] wi [0,1] --> i=0, [0, 4] wi [0,15] --> i=0, [0, 60] | wi [0,16] --> i=0, [0, 64] - i=1, [0, 1] i=1, [0, 5] i=1, [0, 61] | i=1, [0, 65] - i=2, [0, 2] i=2, [0, 6] i=2, [0, 62] | i=2, [0, 66] - i=3, [0, 3] i=3, [0, 7] i=3, [0, 63] | i=3, [0, 67] - i=4, [1, 0] i=4, [1, 4] i=4, [1, 60] | .... - i=5, [1, 1] i=5, [1, 5] i=5, [1, 61] | - i=6, [1, 2] i=6, [1, 6] i=6, [1, 62] | - i=7, [1, 3] i=7, [1, 7] i=7, [1, 63] | - ... ... .... | - i=28,[7, 0] i=28,[7, 4] i=28,[7, 60] | i=28, [7, 124] - i=29,[7, 1] i=29,[7, 5] i=29,[7, 61] | i=29, [7, 125] - i=30,[7, 2] i=30,[7, 6] i=30,[7, 62] | i=30, [7, 126] - i=31,[7, 3] i=31,[7, 7] i=31,[7, 63] | i=31, [7, 127] ----------------------------------------------------------------------------------------- --------------------------- -wi [1,0] --> i=0, [8, 0] - i=1, [8, 1] - i=2, [8, 2] - i=3, [8, 2] - ... 
- i=28, [15, 0] - i=29, [15, 1] - i=30, [15, 2] - i=31, [15, 3] -*/ - -// clang-format on - -template -void matrix_sum_cols(big_matrix &B, - big_matrix &Bvnni) { - buffer bufB(B.get_data(), range<2>(Rows, Cols)); - buffer bufBvnni(Bvnni.get_data(), range<2>(Rows / VNNI, Cols * VNNI)); - - TResult sum_cols[Cols] = {0}; - buffer sum_cols_v(sum_cols, Cols); - - size_t NDRangeK = Rows / TileRows; - size_t NDRangeN = Cols / TileCols; - queue q; - size_t sg_size = get_sg_size>(q); - nd_range<2> r({NDRangeK, NDRangeN * sg_size}, {1, 1 * sg_size}); - - q.submit([&](handler &cgh) { - sycl::accessor accB{bufBvnni, cgh, sycl::read_write}; - sycl::accessor v{sum_cols_v, cgh, sycl::read_write}; - - cgh.parallel_for>( - r, [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sycl::sub_group sg = spmd_item.get_sub_group(); - - joint_matrix - sub_b; - - joint_matrix_load( - sg, sub_b, - accB.template get_multi_ptr() + - (sg_startx * (TileRows / VNNI) * Cols * VNNI) + - sg_starty / sg_size * TileCols * VNNI, - Cols * VNNI); - - TResult sum_local_cols[Cols] = {0}; - ext::intel::experimental::matrix::joint_matrix_apply( - sg, sub_b, [&](T &x, size_t row, size_t col) { - // the coordinates returned are in the logical range - // [Rows,Cols] If users want to retrieve the VNNIed - // coordinates, they can be obtained using colVNNI = col/VNNI - // rowVNNI = row*VNNI - size_t global_index = col + global_idy / sg_size * TileCols; - sum_local_cols[global_index] += x; - }); - - for (int i = 0; i < Cols; i++) { - sum_local_cols[i] = - reduce_over_group(sg, sum_local_cols[i], sycl::plus<>()); - if (global_idy % sg_size == 0) { - sycl::atomic_ref - aref(v[i]); - aref.fetch_add(sum_local_cols[i]); - } - } - }); // 
parallel for - }).wait(); - sum_cols_ref(bufB.get_host_access(), - sum_cols_v.get_host_access()); -} - -template -void test() { - static constexpr size_t scale = 2; - static constexpr size_t MATRIX_K = TK * scale; - static constexpr size_t MATRIX_N = TN * scale; - - T B[MATRIX_K][MATRIX_N]; - big_matrix MB((T *)&B); - - T Bvnni[MATRIX_K / VNNI][MATRIX_N * VNNI]; - big_matrix MBvnni((T *)&Bvnni); - - for (int i = 0; i < MATRIX_K; i++) { - for (int j = 0; j < MATRIX_N; j++) { - B[i][j] = i + j; - } - } - matrix_vnni(MATRIX_K, MATRIX_N, *B, *Bvnni, VNNI); - // This test calculates sum of columns in the non VNNI B matrix - matrix_sum_cols(MB, MBvnni); -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test(); - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test(); - break; - } - } - return 0; -} \ No newline at end of file diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes_impl.hpp deleted file mode 100644 index b3001a68bb227..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes_impl.hpp +++ /dev/null @@ -1,155 +0,0 @@ -//===---joint_matrix_all_sizes_impl.hpp - DPC++ joint_matrix---------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -static constexpr size_t M_MULTIPLIER = 16; - -template class mult; - -template -void matrix_multiply(big_matrix &C, big_matrix &A, - big_matrix &B) { - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC(C.get_data(), range<2>(M, N)); - - queue q; - size_t sg_size = get_sg_size(q); - q.submit([&](handler &cgh) { - sycl::accessor accC{bufC, cgh, sycl::read_write}; - sycl::accessor accA{bufA, cgh, sycl::read_only}; - sycl::accessor accB{bufB, cgh, sycl::read_only}; - - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a; - // For B, we assume B has been already VNNIed. 
- joint_matrix - sub_b; - joint_matrix sub_c; - - joint_matrix_load( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - for (int k = 0; k < K / TK; k += 1) { - joint_matrix_load( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TM) * K + k * TK, - K); - joint_matrix_load( - sg, sub_b, - accB.template get_multi_ptr() + - (k * TK / vnniFactor) * (N * vnniFactor) + - sg_starty / sg_size * TN * vnniFactor, - N * vnniFactor); - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - } - joint_matrix_store( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - }); // parallel for - }).wait(); -} - -template -void init_and_multiply() { - static constexpr size_t MATRIX_M = tM * M_MULTIPLIER; - static constexpr size_t MATRIX_N = 128; - static constexpr size_t MATRIX_K = 128; - - std::cout << "MATRIX_M=" << MATRIX_M << "\n"; - - T A[MATRIX_M][MATRIX_K]; - T B[MATRIX_K][MATRIX_N]; - T Bvnni[MATRIX_K / vnni_factor][MATRIX_N * vnni_factor]; - TResult C[MATRIX_M][MATRIX_N]; - TResult D[MATRIX_M][MATRIX_N]; - - matrix_rand(MATRIX_M, MATRIX_K, (T *)A, (T)50); - matrix_rand(MATRIX_K, MATRIX_N, (T *)B, (T)50); - matrix_fill(MATRIX_M, MATRIX_N, (TResult *)C, (TResult)1); - matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, (TResult)1); - - big_matrix MC((TResult *)&C); - big_matrix MD((TResult *)&D); - big_matrix MA((T *)&A); - matrix_vnni(MATRIX_K, MATRIX_N, (T *)&B, (T *)&Bvnni, vnni_factor); - big_matrix MBvnni( - (T *)&Bvnni); - - matrix_multiply(MC, MA, MBvnni); - matrix_multiply_ref((T *)A, (T *)B, (TResult *)D, MATRIX_M, MATRIX_N, - MATRIX_K); - - assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)C, (TResult *)D)); -} - -template -void test() { - init_and_multiply>(); - init_and_multiply>(); - init_and_multiply>(); - init_and_multiply>(); - init_and_multiply>(); - init_and_multiply>(); - init_and_multiply>(); - 
init_and_multiply>(); -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test(); - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test(); - test(); - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test(); - test(); - break; - } - } - - return 0; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_annotated_ptr_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_annotated_ptr_impl.hpp deleted file mode 100644 index 5463ea040d1eb..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_annotated_ptr_impl.hpp +++ /dev/null @@ -1,128 +0,0 @@ -//===---joint_matrix_annotated_ptr_impl.hpp - DPC++ joint_matrix-----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include - -#define TM 8 -#define TK 16 - -template class mult; - -template -void matrix_multiply(T1 *C, T2 *A, T2 *B, queue &q) { - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - size_t sg_size = get_sg_size>(q); - q.submit([&](handler &cgh) { - cgh.parallel_for>( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_a; - joint_matrix - sub_b; - joint_matrix - sub_bp; - joint_matrix sub_c; - - auto C_ptr = syclex::annotated_ptr{ - C, syclex::properties{ - syclintelex::read_assertion>}}; - auto A_ptr = syclex::annotated_ptr{ - A, - syclex::properties{syclintelex::read_assertion< - syclintelex::cache_control>}}; - auto B_ptr = syclex::annotated_ptr{ - B, - syclex::properties{syclintelex::read_hint< - syclintelex::cache_control>}}; - joint_matrix_load(sg, sub_c, - C_ptr + (sg_startx * TM) * N + - sg_starty / sg_size * TN, - N, layout::row_major); - for (int k = 0; k < K / TK; k += 1) { - joint_matrix_load(sg, sub_a, A_ptr + (sg_startx * TM) * K + k * TK, - K); - if constexpr (vnniFactor == 0) { - joint_matrix_load( - sg, sub_b, B_ptr + (k * TK) * N + sg_starty / sg_size * TN, - N); - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - } else { - joint_matrix_load(sg, sub_bp, - B_ptr + - (k * TK / vnniFactor) * (N * vnniFactor) + - sg_starty / sg_size * TN * vnniFactor, - N * vnniFactor); - - joint_matrix_mad(sg, sub_c, sub_a, sub_bp, sub_c); - } - } - auto C_w_ptr = syclex::annotated_ptr{ - C, - 
syclex::properties{syclintelex::write_hint< - syclintelex::cache_control>}}; - joint_matrix_store(sg, sub_c, - C_w_ptr + (sg_startx * TM) * N + - sg_starty / sg_size * TN, - N, layout::row_major); - }); // parallel for - }).wait(); -} - -int main() { - queue q; - static constexpr size_t M = TM * 2; - static constexpr size_t N = TN * 2; - static constexpr size_t K = TK * 2; - static constexpr unsigned int vnniFactor = 2; - bfloat16 *A = malloc_shared(M * K, q); - bfloat16 *B = malloc_shared(K * N, q); - bfloat16 *vnniB = malloc_shared(K * N, q); - float *C = malloc_shared(M * N, q); - float *D = malloc_shared(M * N, q); - - matrix_fill(M, K, A, [](int i, int j) { return 1.0f * (i + j); }); - matrix_fill(K, N, (bfloat16 *)B, - [](int i, int j) { return 2.0f * i + 3.0f * j; }); - matrix_fill(M, N, C, 1.0f); - matrix_fill(M, N, D, 1.0f); - - matrix_vnni(K, N, B, vnniB, vnniFactor); - - matrix_multiply_ref(A, B, D, M, N, K); - - // Currently row major B fails when annotated_ptr is used - matrix_multiply(C, A, B, q); - bool res0 = matrix_compare(M, N, C, D); - std::cout << (res0 ? "B row major passed" : "failed") << std::endl; - - matrix_fill(M, N, C, 1.0f); - matrix_multiply(C, A, vnniB, q); - bool res1 = matrix_compare(M, N, C, D); - std::cout << (res1 ? "B VNNI passed" : "failed") << std::endl; - - return !(res0 & res1); -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16_impl.hpp deleted file mode 100644 index 3fc96f77e020a..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16_impl.hpp +++ /dev/null @@ -1,113 +0,0 @@ -//===---joint_matrix_apply_bf16_impl.hpp - DPC++ joint_matrix--------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -template class add; -template class add_func; - -template struct apply_add { - void operator()(T &x) const { x = x + T(2); } -}; - -template -void matrix_verify_add(big_matrix &A, const TResult ref, - F &&lambda) { - buffer bufA(A.get_data(), range<2>(Rows, Cols)); - - queue q; - size_t sg_size = get_sg_size(q); - nd_range<2> r({Rows / TileRows, Cols / TileCols * sg_size}, {1, 1 * sg_size}); - - q.submit([&](handler &cgh) { - accessor accA{bufA, cgh}; - - cgh.parallel_for( - r, [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_a; - - joint_matrix_fill(sg, sub_a, T(5.0)); - - joint_matrix_apply(sg, sub_a, lambda); - - ext::intel::experimental::matrix::joint_matrix_store( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TileRows) * Cols + - sg_starty / sg_size * TileCols, - Cols); - }); // parallel for - }).wait(); - // Check if the results are correct - { - host_accessor Acc{bufA}; - assert(std::all_of(Acc.begin(), Acc.end(), [=](auto Elem) { - return (std::fabs(static_cast(make_fp32(Elem) - ref)) < - std::numeric_limits::epsilon()); - })); - } -} - -template void test() { - std::cout << "Testing: " << TM << " x " << TK << " [TM x TK]" << std::endl; - - static constexpr size_t Rows = TM * 2; - static constexpr size_t Cols = TK * 2; - T A[Rows][Cols]; - - big_matrix MA((T *)&A); - - matrix_verify_add>( - MA, 7.0, [=](T &x) { x = x + T(2); }); - matrix_verify_add>( - MA, 7.0, apply_add()); -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - 
.get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test(); - // This combination is not currently supported for sub group size = 32 in - // IGC -#if (!defined(SG_SZ) || SG_SZ != 32) - test(); - test(); - test(); -#endif - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test(); - break; - } - } - return 0; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices_impl.hpp deleted file mode 100644 index 9751571bcbcf5..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices_impl.hpp +++ /dev/null @@ -1,157 +0,0 @@ -//==------- joint_matrix_apply_two_matrices_impl.hpp - DPC++ joint_matrix--==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include - -template -bool apply_verify(Tc *C, Tc *D, Ta *A, Ta *Ar) { - for (size_t i = 0; i < M; i++) - for (size_t j = 0; j < N; j++) { - Tc diffc = D[i * N + j] - C[i * N + j] * 2; - Ta diffa = Ar[i * N + j] - (A[i * N + j] + 42); - if constexpr (std::is_same_v) { - if (std::fabs(diffc) > FLOAT_EPSILON || - std::fabs(diffa) > FLOAT_EPSILON || std::isnan(C[i * N + j]) || - std::isnan(A[i * N + j])) { - return false; - } - } else { - if (std::abs(diffc) > 0 || std::abs(diffa) > 0) { - return false; - } - } - } - return true; -} -template -bool apply_two_matrices(Tc *C, Tc *D, Ta *A, Ta *Ar, queue q) { - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - - size_t sg_size = get_sg_size(q); - q.submit([&](handler &cgh) { - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - auto pC = - address_space_cast(C); - auto pD = - address_space_cast(D); - auto pA = - address_space_cast(A); - auto pAr = - address_space_cast(Ar); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a; - joint_matrix - sub_ar; - joint_matrix sub_c; - joint_matrix sub_d; - - joint_matrix_load( - sg, sub_c, pC + (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - joint_matrix_apply(sg, sub_c, sub_d, - [](const Tc &x, Tc &y) { y = x * 2; }); - joint_matrix_store( - sg, sub_d, pD + (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - joint_matrix_load( - sg, sub_a, pA + (sg_startx * TM) * K + sg_starty / sg_size * TK, - K); - 
joint_matrix_apply(sg, sub_a, sub_ar, - [](const Ta &x, Ta &y) { y = x + 42; }); - ext::intel::experimental::matrix::joint_matrix_store( - sg, sub_ar, - pAr + (sg_startx * TM) * K + sg_starty / sg_size * TK, K); - }); // parallel for - }).wait(); - return apply_verify(C, D, A, Ar); -} - -template -bool test() { - static constexpr size_t M = TM * 2; - static constexpr size_t N = TN * 2; - static constexpr size_t K = TK * 2; - queue q; - - Tc *C = malloc_shared(M * N, q); - Tc *D = malloc_shared(M * N, q); - Ta *A = malloc_shared(M * K, q); - Ta *Ar = malloc_shared(M * K, q); - - matrix_rand(M, N, (Tc *)C, (Tc)100); - matrix_rand(M, K, (Ta *)A, (Ta)100); - - bool res = apply_two_matrices( - C, D, A, Ar, q); - - if constexpr (std::is_same_v) - std::cout << "bfloat16 " << TM << "x" << TN << "x" << TK << ": " - << (res ? "passed" : "failed") << std::endl; - else if constexpr (std::is_same_v) - std::cout << "int8_t " << TM << "x" << TN << "x" << TK << ": " - << (res ? "passed" : "failed") << std::endl; - free(C, q); - free(D, q); - free(A, q); - free(Ar, q); - - return res; -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - bool passed = true; - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - passed &= test(); - passed &= test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - passed &= test(); - passed &= test(); -// This combination is not currently supported for sub group size = 32 in IGC -#if (!defined(SG_SZ) || SG_SZ != 32) - passed &= test(); -#endif - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - passed &= test(); - passed &= test(); - break; - } - } - - return !passed; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_impl.hpp deleted file mode 100644 index 
56250cf9fb3e1..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_impl.hpp +++ /dev/null @@ -1,337 +0,0 @@ -//------------------------------------------------------------------------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===-------------------------------------------------------------------------===// - -#include -#include - -// number of test iterations -constexpr unsigned int testIterations = 100; -// start recording time after X iterations -constexpr unsigned int recordThresh = 10; - -#ifndef MATRIX_SIZE -#define MATRIX_SIZE 256 -#endif - -#ifdef MANUAL_UNROLL -template -static constexpr void loop(std::integer_sequence, F &&f) { - (f(std::integral_constant{}), ...); // C++17 fold expression -} - -template -static constexpr void manually_unroll_loop(F &&f) { - loop(std::make_integer_sequence{}, std::forward(f)); -} -#endif - -template class MatMul; - -template -double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { - size_t sgSize = get_sg_size>(q); - range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize}; - range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize}; - - // throw error if padding needed - assert(colsA == rowsB); - assert(rowsA % TM == 0); - assert(colsA % TK == 0); - assert(colsB % TN == 0); - // submit main kernel - std::chrono::high_resolution_clock::time_point start = - std::chrono::high_resolution_clock::now(); - - q.submit([&](handler &h) { - h.parallel_for>( // cache layer#1 - nd_range<2>{global, cachelocal}, - // loop global - // loop localrange - [=](nd_item<2> it) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - auto pA = - address_space_cast(A); - auto pB = - address_space_cast(B); - auto pC = - address_space_cast(C); - auto m2 = it.get_group(0); - auto n2 = 
it.get_group(1); - auto m1 = it.get_local_id(0); - auto n1 = it.get_local_id(1) / sgSize; - auto sg = it.get_sub_group(); - joint_matrix - tC[MCache1 / TM][NCache1 / TN] -#ifdef INIT_LIST - = {}; // default initialization of all array elements -#else - ; // no initialization -#endif - -#ifdef MANUAL_UNROLL - manually_unroll_loop([&](auto m) { - manually_unroll_loop([&](auto n) { -#else - for (unsigned int m = 0; m < MCache1 / TM; m++) { - for (unsigned int n = 0; n < NCache1 / TN; n++) { -#endif - joint_matrix_fill(sg, tC[m][n], 0); -#ifdef MANUAL_UNROLL - }); - }); -#else - } - } -#endif - - for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) { - joint_matrix - tA[MCache1 / TM][KCache2 / KCache1] -#ifdef INIT_LIST - = {}; // default initialization of all array elements -#else - ; // no initialization -#endif - - joint_matrix - tB[NCache1 / TN][KCache2 / KCache1] -#ifdef INIT_LIST - = {}; // default initialization of all array elements -#else - ; // no initialization -#endif - -#ifdef MANUAL_UNROLL - manually_unroll_loop([&](auto k1) { -#else - for (unsigned int k1 = 0; k1 < KCache2 / KCache1; k1++) { -#endif - // physical layer - unsigned int k = (k2 * KCache2 + k1 * KCache1) / TK; -#ifdef MANUAL_UNROLL - manually_unroll_loop([&](auto m) { -#else - for (unsigned int m = 0; m < MCache1 / TM; m++) { -#endif -#ifdef OOB - ext::intel::experimental::matrix::joint_matrix_load_checked( - sg, tA[m][k1], pA, colsA, rowsA, colsA, - m2 * MCache2 + m1 * MCache1 + m * TM, k * TK); -#else - joint_matrix_load( - sg, tA[m][k1], - pA + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsA + - k * TK, - colsA); -#endif -#ifdef MANUAL_UNROLL - }); // m -#else - } // m -#endif -#ifdef MANUAL_UNROLL - manually_unroll_loop([&](auto n) { -#else - for (unsigned int n = 0; n < NCache1 / TN; n++) { -#endif -#ifdef OOB - ext::intel::experimental::matrix::joint_matrix_load_checked( - sg, tB[n][k1], pB, colsB * VNNI, rowsB / VNNI, colsB * VNNI, - k * TK / VNNI, - (n2 * NCache2 + n1 * NCache1 + n * 
TN) * VNNI); -#else - joint_matrix_load(sg, tB[n][k1], - pB + (k * TK / VNNI) * (colsB * VNNI) + - (n2 * NCache2 + n1 * NCache1 + n * TN) * - VNNI, - colsB * VNNI); -#endif -#ifdef MANUAL_UNROLL - }); -#else - } // n -#endif -#ifdef MANUAL_UNROLL - manually_unroll_loop([&](auto m) { -#else - for (unsigned int m = 0; m < MCache1 / TM; m++) { -#endif -#ifdef MANUAL_UNROLL - manually_unroll_loop([&](auto n) { -#else - for (unsigned int n = 0; n < NCache1 / TN; n++) { - -#endif - joint_matrix_mad(sg, tC[m][n], tA[m][k1], tB[n][k1], - tC[m][n]); -#ifdef MANUAL_UNROLL - }); // n - }); // m - }); // for k1 -#else - } // n - } // m - } // k1 -#endif - } // for k2 -#ifdef MANUAL_UNROLL - manually_unroll_loop([&](auto m) { -#else - for (unsigned int m = 0; m < MCache1 / TM; m++) { -#endif -#ifdef MANUAL_UNROLL - manually_unroll_loop([&](auto n) { -#else - for (unsigned int n = 0; n < NCache1 / TN; n++) { -#endif -#ifdef OOB - ext::intel::experimental::matrix::joint_matrix_store_checked( - sg, tC[m][n], pC, colsB, layout::row_major, rowsA, colsB, - m2 * MCache2 + m1 * MCache1 + m * TM, - n2 * NCache2 + n1 * NCache1 + n * TN); -#else - joint_matrix_store( - sg, tC[m][n], - pC + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsB + - (n2 * NCache2 + n1 * NCache1 + n * TN), - colsB, layout::row_major); -#endif -#ifdef MANUAL_UNROLL - }); // n - }); // m -#else - } // n - } // m -#endif - }); // parallel_for - }); // queue.submit - - if (i == testIterations - 1) - q.wait(); - std::chrono::duration duration = - std::chrono::high_resolution_clock::now() - start; - - return duration.count(); -} - -template -void test() { - assert(MATRIX_SIZE >= TM && MATRIX_SIZE >= TK && MATRIX_SIZE >= TN && - "invalid matrix size"); - assert((MATRIX_SIZE % TM) == 0 && (MATRIX_SIZE % TN) == 0 && - (MATRIX_SIZE % TK) == 0 && - "invalid matrix size detected: not a multiple of "); - - std::cout << "Testing: " << TM << " x " << TN << " x " << TK - << " [TM x TN x TK]" << std::endl; - - queue q; - T *A = 
malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - T *B = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - T *vnniB = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - TResult *C = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - TResult *refC = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - - matrix_rand(MATRIX_SIZE, MATRIX_SIZE, A, T(1)); - matrix_rand(MATRIX_SIZE, MATRIX_SIZE, B, T(1)); - matrix_vnni(MATRIX_SIZE, MATRIX_SIZE, B, vnniB, VNNI); - - matrix_multiply_ref(A, B, refC, MATRIX_SIZE, MATRIX_SIZE, - MATRIX_SIZE); - - // run testIterations time, aggregate and calculate average run time - double totalDuration = 0; - for (unsigned int i = 0; i < testIterations; i++) { - double duration = - joint_matmul(A, vnniB, C, q, i); - if (i >= recordThresh) { - totalDuration += duration; - } - } - - assert(matrix_compare(MATRIX_SIZE, MATRIX_SIZE, C, refC)); - - double msecPerMatrixMul = - totalDuration / static_cast(testIterations - recordThresh); - double gflops = (2.f * MATRIX_SIZE * MATRIX_SIZE * MATRIX_SIZE * 1.0e-9f) / - (msecPerMatrixMul / 1000.f); - - std::cout << "DONE for size " << MATRIX_SIZE << std::endl; - std::cout << "GOPS is " << gflops << " Gop/s" << std::endl; - - free(A, q); - free(B, q); - free(vnniB, q); - free(C, q); - free(refC, q); -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - constexpr size_t MCache1 = 32; - constexpr size_t MCache2 = 256; - constexpr size_t NCache2 = 256; - constexpr size_t KCache2 = 32; - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - constexpr size_t NCache1 = 32; - constexpr size_t KCache1 = 32; - - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - constexpr size_t NCache1 = 4 * /*TN*/ 16; - constexpr size_t KCache1 = 16; - - test(); -#if (!defined(SG_SZ) || SG_SZ != 32) - // These combination are not currently supported for subgroup size = 32 in - // IGC - test(); - test(); -#endif - 
break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - constexpr size_t NCache1 = 4 * /*TN*/ 8; - constexpr size_t KCache1 = 16; - - test(); - break; - } - } - return 0; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array_impl.hpp deleted file mode 100644 index f393eaa5e8436..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array_impl.hpp +++ /dev/null @@ -1,153 +0,0 @@ -//==----------------------------------------------------------------------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===-------------------------------------------------------------------------===// - -template class mult; - -static constexpr int JM_ARRAY_SZ = 2; - -template -void matrix_multiply(big_matrix &C, big_matrix &A, - big_matrix &B) { - size_t NDRangeM = M / (TM * JM_ARRAY_SZ); - size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC((TResult *)C.get_data(), range<2>(M, N)); - - queue q; - size_t sg_size = get_sg_size>(q); - q.submit([&](handler &cgh) { - sycl::accessor accA{bufA, cgh, sycl::read_write}; - sycl::accessor accB{bufB, cgh, sycl::read_write}; - sycl::accessor accC{bufC, cgh, sycl::read_write}; - - cgh.parallel_for>( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - // Matrix API has to be accessed by all the workitems in a - // subgroup. These functions will be called once by the subgroup. - // No code divergence between the workitems. 
- const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_a[JM_ARRAY_SZ]; - - // For B, we assume B has been already VNNIed. - joint_matrix - sub_b; - joint_matrix - sub_c[JM_ARRAY_SZ]; - - for (int i = 0; i < JM_ARRAY_SZ; ++i) - joint_matrix_fill(sg, sub_c[i], TResult(1)); - - for (int k = 0; k < K / TK; ++k) { - joint_matrix_load( - sg, sub_b, - accB.template get_multi_ptr() + - (k * TK / VNNI) * (N * VNNI) + - sg_starty / sg_size * TN * VNNI, - N * VNNI); - - for (int i = 0; i < JM_ARRAY_SZ; ++i) { - joint_matrix_load( - sg, sub_a[i], - accA.template get_multi_ptr() + - (sg_startx * TM * JM_ARRAY_SZ + TM * i) * K + k * TK, - K); - joint_matrix_mad(sg, sub_c[i], sub_a[i], sub_b, sub_c[i]); - } - } - - for (int i = 0; i < JM_ARRAY_SZ; ++i) - joint_matrix_store( - sg, sub_c[i], - accC.template get_multi_ptr() + - (sg_startx * TM * JM_ARRAY_SZ + TM * i) * N + - sg_starty / sg_size * TN, - N, layout::row_major); - }); // parallel for - }).wait(); -} - -template -void test() { - std::cout << "Testing: " << TM << " x " << TN << " x " << TK - << " [TM x TN x TK]" << std::endl; - static constexpr size_t MATRIX_M = TM * 2; - static constexpr size_t MATRIX_N = TN * 2; - static constexpr size_t MATRIX_K = TK * 2; - - T A[MATRIX_M][MATRIX_K]; - T B[MATRIX_K / VNNI][MATRIX_N * VNNI]; - - TResult C[MATRIX_M][MATRIX_N]; - TResult D[MATRIX_M][MATRIX_N]; - - matrix_fill(MATRIX_M, MATRIX_K, (T *)A, - [](int i, int j) { return TResult(1) * (i + j); }); - matrix_fill(MATRIX_K / VNNI, MATRIX_N * VNNI, (T *)B, - [](int i, int j) { return TResult(2) * i + TResult(3) * j; }); - matrix_fill(MATRIX_M, MATRIX_N, (TResult *)C, TResult(1)); - matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, TResult(1)); - - big_matrix MC((TResult *)&C); - big_matrix 
MD((TResult *)&D); - big_matrix MA((T *)&A); - big_matrix MB((T *)&B); - - matrix_multiply( - MC, MA, MB); - matrix_multiply_ref((T *)A, (T *)B, (TResult *)D, - MATRIX_M, MATRIX_N, MATRIX_K / VNNI); - - assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)C, (TResult *)D)); -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test(); -#if (!defined(SG_SZ) || SG_SZ != 32) - // These combination are not currently supported for subgroup size = 32 in - // IGC - test(); - test(); - test(); - break; -#endif - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test(); - break; - } - } - return 0; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp deleted file mode 100644 index e3234da2cd5d9..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp +++ /dev/null @@ -1,105 +0,0 @@ -//==-joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp- DPC++ joint_matrix-==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#define TM 8 -#define TK 16 - -template -void matrix_multiply(big_matrix &C, big_matrix &A, - big_matrix &B) { - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC((float *)C.get_data(), range<2>(M, N)); - - queue q; - size_t sg_size = get_sg_size(q); - q.submit([&](handler &cgh) { - auto accC = bufC.get_access(cgh); - auto accA = bufA.get_access(cgh); - auto accB = bufB.get_access(cgh); - - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_a; - // For B, we assume B has been already VNNIed. 
- joint_matrix - sub_b; - joint_matrix sub_c; - - joint_matrix_load( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - for (int k = 0; k < K / TK; k += 1) { - joint_matrix_load( - sg, sub_a, - accA.template get_multi_ptr() + - (k * TK) * M + sg_startx * TM, - M); - joint_matrix_load( - sg, sub_b, - accB.template get_multi_ptr() + - (sg_starty / sg_size * TN) * K + k * TK, - K); - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - } - joint_matrix_store( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - }); // parallel for - }).wait(); -} - -int main() { - static constexpr size_t MATRIX_M = TM * 2; - static constexpr size_t MATRIX_N = TN * 2; - static constexpr size_t MATRIX_K = TK * 2; - bfloat16 A[MATRIX_K][MATRIX_M]; - bfloat16 B[MATRIX_N][MATRIX_K]; - float C[MATRIX_M][MATRIX_N]; - float D[MATRIX_M][MATRIX_N]; - - matrix_fill(MATRIX_K, MATRIX_M, (bfloat16 *)A, - [](int i, int j) { return 1.0f * (i + j); }); - matrix_fill(MATRIX_N, MATRIX_K, (bfloat16 *)B, - [](int i, int j) { return 2.0f * i + 3.0f * j; }); - matrix_fill(MATRIX_M, MATRIX_N, (float *)C, 1.0f); - matrix_fill(MATRIX_M, MATRIX_N, (float *)D, 1.0f); - - big_matrix MC((float *)&C); - big_matrix MD((float *)&D); - big_matrix MA((bfloat16 *)&A); - big_matrix MB((bfloat16 *)&B); - matrix_multiply(MC, MA, MB); - matrix_multiply_ref((bfloat16 *)A, (bfloat16 *)B, (float *)D, MATRIX_M, - MATRIX_N, MATRIX_K, false, true, true); - - bool res = matrix_compare(MATRIX_M, MATRIX_N, (float *)C, (float *)D); - std::cout << (res ? 
"passed" : "failed") << std::endl; - return !res; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_impl.hpp deleted file mode 100644 index fc1fc1963e047..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_impl.hpp +++ /dev/null @@ -1,142 +0,0 @@ -//===---joint_matrix_bfloat16_impl.hpp - DPC++ joint_matrix----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -template class imatrix; - -template -void matrix_multiply(big_matrix &C, big_matrix &A, - big_matrix &B) { - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC((float *)C.get_data(), range<2>(M, N)); - - queue q; - size_t sg_size = get_sg_size>(q); - q.submit([&](handler &cgh) { - auto accC = bufC.get_access(cgh); - auto accA = bufA.get_access(cgh); - auto accB = bufB.get_access(cgh); - - cgh.parallel_for>( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_a; - // For B, we assume B has been already VNNIed. 
- joint_matrix - sub_b; - joint_matrix sub_c; - - joint_matrix_load( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - for (int k = 0; k < K / TK; k += 1) { // - joint_matrix_load( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TM) * K + k * TK, - K); - joint_matrix_load( - sg, sub_b, - accB.template get_multi_ptr() + - (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2, - N * 2); - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - } - joint_matrix_store( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - }); // parallel for - }).wait(); -} - -template -void test() { - std::cout << "Testing: " << TM << " x " << TN << " x " << TK - << " [TM x TN x TK]" << std::endl; - - static constexpr size_t MATRIX_M = TM * 2; - static constexpr size_t MATRIX_N = TN * 2; - static constexpr size_t MATRIX_K = TK * 2; - T A[MATRIX_M][MATRIX_K]; - T B[MATRIX_K / 2][MATRIX_N * 2]; - TResult C[MATRIX_M][MATRIX_N]; - TResult D[MATRIX_M][MATRIX_N]; - - matrix_fill(MATRIX_M, MATRIX_K, (T *)A, - [](int i, int j) { return T(1) * (i + j); }); - matrix_fill(MATRIX_K / 2, MATRIX_N * 2, (T *)B, - [](int i, int j) { return T(2) * i + T(3) * j; }); - matrix_fill(MATRIX_M, MATRIX_N, (TResult *)C, TResult(1)); - matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, TResult(1)); - - big_matrix MC((TResult *)&C); - big_matrix MD((TResult *)&D); - big_matrix MA((T *)&A); - big_matrix MB((T *)&B); - matrix_multiply(MC, MA, - MB); - matrix_multiply_ref((T *)A, (T *)B, (TResult *)D, MATRIX_M, - MATRIX_N, MATRIX_K / 2); - - assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)C, (TResult *)D)); -} -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test(); - break; - } - - if (combinations[i].nsize == 16) 
{ // architecture::intel_gpu_pvc - test(); - - // This combination is not currently supported for sub group size = 32 in - // IGC -#if (!defined(SG_SZ) || SG_SZ != 32) - test(); - test(); - test(); -#endif - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test(); - break; - } - } - return 0; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB_impl.hpp deleted file mode 100644 index 6a7182c41985d..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB_impl.hpp +++ /dev/null @@ -1,132 +0,0 @@ -//=----- joint_matrix_bfloat16_packedB_impl.hpp - DPC++ joint_matrix -------=// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//=-------------------------------------------------------------------------=// - -template -void matrix_multiply(big_matrix &C, big_matrix &A, - big_matrix &B) { - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC((float *)C.get_data(), range<2>(M, N)); - - queue q; - size_t sg_size = get_sg_size(q); - q.submit([&](handler &cgh) { - auto accC = bufC.get_access(cgh); - auto accA = bufA.get_access(cgh); - auto accB = bufB.get_access(cgh); - - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); 
- const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_a; - // For B, we assume B has been already VNNIed. - joint_matrix - sub_b; - joint_matrix sub_c; - - joint_matrix_load( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - for (int k = 0; k < K / TK; k += 1) { // - joint_matrix_load( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TM) * K + k * TK, - K); - // Assuming B data is already in VNNI format. - joint_matrix_load( - sg, sub_b, - accB.template get_multi_ptr() + - (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2, - N * 2); - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - } - joint_matrix_store( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - }); // parallel for - }).wait(); -} - -template int test() { - static constexpr size_t MATRIX_M = TM * 2; - static constexpr size_t MATRIX_N = TN * 2; - static constexpr size_t MATRIX_K = TK * 2; - bfloat16 A[MATRIX_M][MATRIX_K]; - bfloat16 B[MATRIX_K / 2][MATRIX_N * 2]; - float C[MATRIX_M][MATRIX_N]; - float D[MATRIX_M][MATRIX_N]; - - matrix_fill(MATRIX_M, MATRIX_K, (bfloat16 *)A, - [](int i, int j) { return 1.0f * (i + j); }); - matrix_fill(MATRIX_K / 2, MATRIX_N * 2, (bfloat16 *)B, - [](int i, int j) { return 2.0f * i + 3.0f * j; }); - matrix_fill(MATRIX_M, MATRIX_N, (float *)C, 1.0f); - matrix_fill(MATRIX_M, MATRIX_N, (float *)D, 1.0f); - - big_matrix MC((float *)&C); - big_matrix MD((float *)&D); - big_matrix MA((bfloat16 *)&A); - big_matrix MB((bfloat16 *)&B); - matrix_multiply(MC, MA, MB); - matrix_multiply_ref( - (bfloat16 *)A, (bfloat16 *)B, (float *)D, MATRIX_M, MATRIX_N, - MATRIX_K / 2); - - bool res = matrix_compare(MATRIX_M, MATRIX_N, (float *)C, (float *)D); - std::cout << TM << "x" << TN << 
"x" << TK << " "; - std::cout << (res ? "passed" : "failed") << std::endl; - return !res; -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - int ret = 0; - for (auto &combination : combinations) { - if (combination.nsize == 0) { // Intel AMX - ret += test<16, 16, 16, class amx16x16x16>(); - break; - } - - if (combination.nsize == 16) { // architecture::intel_gpu_pvc - ret += test<16, 16, 16, class pvc16x16x16>(); - ret += test<32, 64, 16, class pvc32x64x16>(); - ret += test<1, 64, 16, class pvc1x64x16>(); - break; - } - } - - return ret; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_colA_rowB_colC_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_colA_rowB_colC_impl.hpp deleted file mode 100644 index 373ec652cc063..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_colA_rowB_colC_impl.hpp +++ /dev/null @@ -1,99 +0,0 @@ -//===---joint_matrix_colA_rowB_colC_impl.hpp - DPC++ joint_matrix----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include -#include -#include - -constexpr size_t TM = 8; -constexpr size_t TK = 16; - -template -void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) { - size_t M = NUM_ROWS_C; - size_t N = NUM_COLS_C; - size_t K = NUM_COLS_A; - - assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B); - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - size_t sg_size = get_sg_size(q); - - q.submit([&](handler &cgh) { - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - auto pA = - address_space_cast(A); - auto pB = - address_space_cast(B); - auto pC = - address_space_cast(C); - - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_a; - joint_matrix - sub_b; - joint_matrix sub_c; - joint_matrix_fill(sg, sub_c, 1); - for (int k = 0; k < K; k += TK) { - joint_matrix_load(sg, sub_a, pA + (sg_startx * TM) * K + k, K); - joint_matrix_load(sg, sub_b, pB + k * N + sg_starty / sg_size * TN, - N); - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - } - joint_matrix_store( - sg, sub_c, pC + (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::col_major); - }); // parallel for - }).wait(); -} - -int main() { - static constexpr size_t MATRIX_M = 1024; - static constexpr size_t MATRIX_N = 1024; - static constexpr size_t MATRIX_K = 1024; - queue q; - bfloat16 *A = 
malloc_shared(MATRIX_M * MATRIX_K, q); - bfloat16 *B = malloc_shared(MATRIX_K * MATRIX_N, q); - float *C = malloc_shared(MATRIX_M * MATRIX_N, q); - float *D = malloc_shared(MATRIX_M * MATRIX_N, q); - - matrix_rand(MATRIX_M, MATRIX_K, A, (bfloat16)5); - matrix_rand(MATRIX_K, MATRIX_N, B, (bfloat16)5); - matrix_fill(MATRIX_M, MATRIX_N, C, (float)1.0); - matrix_fill(MATRIX_M, MATRIX_N, D, (float)1.0); - - matrix_multiply(C, A, B, q); - matrix_multiply_ref(A, B, D, MATRIX_M, MATRIX_N, MATRIX_K, - true /*transposed c*/); - - bool res = matrix_compare(MATRIX_M, MATRIX_N, C, D); - - std::cout << (res ? "passed" : "failed") << std::endl; - return !res; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_down_convert_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_down_convert_impl.hpp deleted file mode 100644 index 8ac48511c7e10..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_down_convert_impl.hpp +++ /dev/null @@ -1,82 +0,0 @@ -//==-------- joint_matrix_down_convert_impl.hpp - DPC++ joint_matrix-------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -constexpr size_t TM = 8; -// TN and TK must be the same for this test. 
-constexpr size_t TN = 16; -constexpr size_t TK = 16; - -template -void matrix_copy(big_matrix &C, big_matrix &A) { - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufC((float *)C.get_data(), range<2>(M, N)); - - queue q; - size_t sg_size = get_sg_size(q); - q.submit([&](handler &cgh) { - auto accC = bufC.get_access(cgh); - auto accA = bufA.get_access(cgh); - - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_a; - joint_matrix sub_c; - - joint_matrix_load( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - joint_matrix_copy(sg, sub_c, sub_a); - ext::intel::experimental::matrix::joint_matrix_store( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N); - }); // parallel for - }).wait(); -} - -int main() { - static constexpr size_t MATRIX_M = TM * 2; - static constexpr size_t MATRIX_N = TN * 2; - static constexpr size_t MATRIX_K = TK * 2; - bfloat16 A[MATRIX_M][MATRIX_K]; - float C[MATRIX_M][MATRIX_N]; - - matrix_rand(MATRIX_M, MATRIX_N, *C, (float)5); - - big_matrix MC((float *)&C); - big_matrix MA((bfloat16 *)&A); - matrix_copy(MC, MA); - - bool res = matrix_compare(MATRIX_M, MATRIX_N, (bfloat16 *)A, (float *)C); - std::cout << (res ? 
"passed" : "failed") << std::endl; - return !res; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half_impl.hpp deleted file mode 100644 index a9de620af4237..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half_impl.hpp +++ /dev/null @@ -1,131 +0,0 @@ -//===---joint_matrix_half_impl.hpp - DPC++ joint_matrix--------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -template class mult; - -template -void matrix_multiply(big_matrix &C, big_matrix &A, - big_matrix &B) { - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC(C.get_data(), range<2>(M, N)); - - queue q; - size_t sg_size = get_sg_size>(q); - q.submit([&](handler &cgh) { - accessor accA{bufA, cgh}; - accessor accB{bufB, cgh}; - accessor accC{bufC, cgh}; - - cgh.parallel_for>( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup - // no code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a; - // For B, we assume B has been already VNNIed. 
- joint_matrix - sub_b; - joint_matrix sub_c; - - joint_matrix_load( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - for (int k = 0; k < K / TK; k += 1) { - joint_matrix_load( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TM) * K + k * TK, - K); - joint_matrix_load( - sg, sub_b, - accB.template get_multi_ptr() + - (k * TK / VNNI) * (N * VNNI) + - sg_starty / sg_size * TN * VNNI, - N * VNNI); - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - } - joint_matrix_store( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - }); // parallel for - }).wait(); -} - -template -void test() { - static constexpr size_t MATRIX_M = TM * 2; - static constexpr size_t MATRIX_N = TN * 2; - static constexpr size_t MATRIX_K = TK * 2; - T A[MATRIX_M][MATRIX_K]; - T B[MATRIX_K / VNNI][MATRIX_N * VNNI]; - TResult C[MATRIX_M][MATRIX_N]; - TResult D[MATRIX_M][MATRIX_N]; - - matrix_fill(MATRIX_M, MATRIX_K, (T *)A, - [](int i, int j) { return i + 2 * j; }); - matrix_fill(MATRIX_K / VNNI, MATRIX_N * VNNI, (T *)B, - [](int i, int j) { return i + j; }); - matrix_fill(MATRIX_M, MATRIX_N, (TResult *)C, TResult(1)); - matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, TResult(1)); - - big_matrix MC((TResult *)&C); - big_matrix MD((TResult *)&D); - big_matrix MA((T *)&A); - big_matrix MB((T *)&B); - matrix_multiply( - MC, MA, MB); - matrix_multiply_ref((T *)A, (T *)B, (TResult *)D, - MATRIX_M, MATRIX_N, MATRIX_K / VNNI); - - assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)C, (TResult *)D)); -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test(); - break; - } - - if (combinations[i].nsize == 8) 
{ // architecture::intel_gpu_dg2* - test(); - break; - } - } - return 0; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB_impl.hpp deleted file mode 100644 index 1390f8225406c..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB_impl.hpp +++ /dev/null @@ -1,114 +0,0 @@ -//===-joint_matrix_int8_colmajorA_colmajorB_impl.hpp - DPC++ joint_matrix--===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#define TM 8 -#define TK 32 - -template -void matrix_multiply(big_matrix &C, - big_matrix &A, - big_matrix &B) { - size_t M = NUM_ROWS_C; - size_t N = NUM_COLS_C; - size_t K = NUM_COLS_A; - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC(C.get_data(), range<2>(M, N)); - - queue q; - size_t sg_size = get_sg_size(q); - q.submit([&](handler &cgh) { - auto accC = bufC.get_access(cgh); - auto accA = bufA.get_access(cgh); - auto accB = bufB.get_access(cgh); - - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); 
- - sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_a; - joint_matrix - sub_b; - joint_matrix sub_c; - - joint_matrix_fill(sg, sub_c, 0); - for (int k = 0; k < K / TK; k += 1) { - joint_matrix_load( - sg, sub_a, - accA.template get_multi_ptr() + - (k * TK) * M + sg_startx * TM, - M); - joint_matrix_load( - sg, sub_b, - accB.template get_multi_ptr() + - (sg_starty / sg_size * TN) * K + k * TK, - K); - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - } - joint_matrix_store( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - }); // parallel for - }).wait(); -} - -int main() { - static constexpr size_t MATRIX_M = TM; - static constexpr size_t MATRIX_N = TN; - static constexpr size_t MATRIX_K = TK; - int8_t A[MATRIX_K][MATRIX_M]; - int8_t Aref[MATRIX_K][MATRIX_M]; - int8_t B[MATRIX_N][MATRIX_K]; - int8_t Bref[MATRIX_N][MATRIX_K]; - int32_t C[MATRIX_M][MATRIX_N]; - int32_t D[MATRIX_M][MATRIX_N]; - - matrix_fill(MATRIX_K, MATRIX_M, (int8_t *)A, - [](int i, int j) { return 2 * i + j; }); - matrix_fill(MATRIX_K, MATRIX_M, (int8_t *)Aref, - [](int i, int j) { return 2 * i + j; }); - - matrix_fill(MATRIX_N, MATRIX_K, (int8_t *)B, - [](int i, int j) { return i + 2 * j; }); - matrix_fill(MATRIX_N, MATRIX_K, (int8_t *)Bref, - [](int i, int j) { return i + 2 * j; }); - - matrix_fill(MATRIX_M, MATRIX_N, (int32_t *)C, 0); - matrix_fill(MATRIX_M, MATRIX_N, (int32_t *)D, 0); - - big_matrix MC((int32_t *)&C); - big_matrix MD((int32_t *)&D); - big_matrix MA((int8_t *)&A); - big_matrix MB((int8_t *)&B); - matrix_multiply(MC, MA, MB); - matrix_multiply_ref((int8_t *)Aref, (int8_t *)Bref, (int32_t *)D, MATRIX_M, - MATRIX_N, MATRIX_K, false, true, true); - - bool res = matrix_compare(MATRIX_M, MATRIX_N, (int32_t *)C, (int32_t *)D); - std::cout << (res ? 
"passed" : "failed") << std::endl; - return !res; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB_impl.hpp deleted file mode 100644 index 12f91f083def4..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB_impl.hpp +++ /dev/null @@ -1,123 +0,0 @@ -//===-joint_matrix_int8_rowmajorA_rowmajorB_impl.hpp - DPC++ joint_matrix--===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -template class mult; - -template -void matrix_multiply(big_matrix &C, big_matrix &A, - big_matrix &B) { - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC(C.get_data(), range<2>(M, N)); - - queue q; - size_t sg_size = get_sg_size>(q); - q.submit([&](handler &cgh) { - accessor accA{bufA, cgh}; - accessor accB{bufB, cgh}; - accessor accC{bufC, cgh}; - - cgh.parallel_for>( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a; - joint_matrix sub_b; - joint_matrix sub_c; - - 
joint_matrix_fill(sg, sub_c, 0); - for (int k = 0; k < K / TK; k += 1) { - joint_matrix_load( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TM) * K + k * TK, - K); - // VNNI transform is done automatically at this level - joint_matrix_load( - sg, sub_b, - accB.template get_multi_ptr() + - (k * TK) * N + sg_starty / sg_size * TN, - N); - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - } - joint_matrix_store( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - }); // parallel for - }).wait(); -} - -template -void test() { - static constexpr size_t MATRIX_M = TM * 2; - static constexpr size_t MATRIX_N = TN * 2; - static constexpr size_t MATRIX_K = TK * 2; - T A[MATRIX_M][MATRIX_K]; - T B[MATRIX_K][MATRIX_N]; - TResult C[MATRIX_M][MATRIX_N]; - TResult D[MATRIX_M][MATRIX_N]; - - matrix_fill(MATRIX_M, MATRIX_K, (T *)A, [](int i, int j) { return i + j; }); - matrix_fill(MATRIX_K, MATRIX_N, (T *)B, - [](int i, int j) { return i + j * 2; }); - matrix_fill(MATRIX_M, MATRIX_N, (TResult *)C, 0); - matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, 0); - - big_matrix MC((TResult *)&C); - big_matrix MD((TResult *)&D); - big_matrix MA((T *)&A); - big_matrix MB((T *)&B); - matrix_multiply(MC, MA, - MB); - matrix_multiply_ref((T *)A, (T *)B, (TResult *)D, MATRIX_M, - MATRIX_N, MATRIX_K); - - assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)C, (TResult *)D)); -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test(); - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test(); - break; - } - } - return 0; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature_impl.hpp 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature_impl.hpp deleted file mode 100644 index 7aba5911c8386..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature_impl.hpp +++ /dev/null @@ -1,109 +0,0 @@ -//===---joint_matrix_opt_kernel_feature_impl.hpp - DPC++ joint_matrix------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -static constexpr size_t M_MULTIPLIER = 16; - -template -void matrix_multiply(big_matrix &C, big_matrix &A, - big_matrix &B) { - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC(C.get_data(), range<2>(M, N)); - - queue q; - size_t sg_size = get_sg_size(q); - q.submit([&](handler &cgh) { - sycl::accessor accC{bufC, cgh, sycl::read_write}; - sycl::accessor accA{bufA, cgh, sycl::read_only}; - sycl::accessor accB{bufB, cgh, sycl::read_only}; - - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a; - joint_matrix - sub_b; - joint_matrix sub_c; - - joint_matrix_load( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - for (int k = 0; k < K / TK; k += 1) { - 
joint_matrix_load( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TM) * K + k * TK, - K); - joint_matrix_load( - sg, sub_b, - accB.template get_multi_ptr() + - (k * TK / vnniFactor) * (N * vnniFactor) + - sg_starty / sg_size * TN * vnniFactor, - N * vnniFactor); - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - } - }); - }).wait(); -} - -template -void init_and_multiply() { - static constexpr size_t MATRIX_M = tM * M_MULTIPLIER; - static constexpr size_t MATRIX_N = 128; - static constexpr size_t MATRIX_K = 128; - - Ta A[MATRIX_M][MATRIX_K]; - Ta B[MATRIX_K][MATRIX_N]; - Ta Bvnni[MATRIX_K / vnni_factor][MATRIX_N * vnni_factor]; - Tc C[MATRIX_M][MATRIX_N]; - - matrix_rand(MATRIX_M, MATRIX_K, (Ta *)A, (Ta)50); - matrix_rand(MATRIX_K, MATRIX_N, (Ta *)B, (Ta)50); - matrix_fill(MATRIX_M, MATRIX_N, (Tc *)C, (Tc)1); - - big_matrix MC((Tc *)&C); - big_matrix MA((Ta *)&A); - matrix_vnni(MATRIX_K, MATRIX_N, (Ta *)&B, (Ta *)&Bvnni, vnni_factor); - big_matrix MBvnni( - (Ta *)&Bvnni); - - matrix_multiply(MC, MA, MBvnni); -} - -int main() { - try { - init_and_multiply(); // 500 is not correct size - } catch (const sycl::exception &e) { - if (e.code() == errc::invalid) - return 0; - throw; - } - - return 1; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds_impl.hpp deleted file mode 100644 index 4be21beea9c45..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds_impl.hpp +++ /dev/null @@ -1,124 +0,0 @@ -//===---joint_matrix_out_bounds_impl.hpp - DPC++ joint_matrix--------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include -#include - -constexpr size_t TM = 8; -constexpr size_t TK = 16; - -template class mult; - -template -void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) { - size_t M = NUM_ROWS_C; - size_t N = NUM_COLS_C; - size_t K = NUM_COLS_A; - - assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * vnniFactor); - // Add one iteration for the out of bounds dpas instruction - size_t NDRangeM = M / TM + (((M % TM) != 0) ? 1 : 0); - size_t NDRangeN = N / TN; - size_t sg_size = get_sg_size>(q); - - q.submit([&](handler &cgh) { - cgh.parallel_for>( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - auto pA = - address_space_cast(A); - auto pB = - address_space_cast(B); - auto pC = - address_space_cast(C); - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_a; - - // For B, since current implementation does not support non-packed - // layout, users need to specify the packed_b layout. 
- joint_matrix sub_b; - joint_matrix sub_c; - // bounds-checked load where width and height are added - ext::intel::experimental::matrix::joint_matrix_fill_checked( - sg, sub_c, 1, N, M, N, sg_startx * TM, sg_starty / sg_size * TN); - for (int k = 0; k < K; k += TK) { - // bounds-checked load where width and height are added - ext::intel::experimental::matrix::joint_matrix_load_checked( - sg, sub_a, pA, K, M, K, sg_startx * TM, k); - // Assume we alreay in vnni format. - // bounds-checked load where width and height are added - ext::intel::experimental::matrix::joint_matrix_load_checked( - sg, sub_b, pB, N * vnniFactor, K / vnniFactor, N * vnniFactor, - k, sg_starty / sg_size * TN * vnniFactor); - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - } - // bounds-checked store where width and height are added - ext::intel::experimental::matrix::joint_matrix_store_checked( - sg, sub_c, pC, N, layout::row_major, M, N, sg_startx * TM, - sg_starty / sg_size * TN); - }); // parallel for - }).wait(); -} - -int main() { - static constexpr size_t MATRIX_M = 1024 + 14; - static constexpr size_t MATRIX_N = 1024; - static constexpr unsigned int vnniFactor = 2; - - queue q; - bfloat16 *A = malloc_shared(MATRIX_M * MATRIX_K, q); - bfloat16 *B = malloc_shared(MATRIX_K * MATRIX_N, q); - bfloat16 *vnniB = malloc_shared(MATRIX_K * MATRIX_N, q); - float *C = malloc_shared(MATRIX_M * MATRIX_N, q); - float *D = malloc_shared(MATRIX_M * MATRIX_N, q); - - matrix_rand(MATRIX_M, MATRIX_K, A, (bfloat16)5); - matrix_rand(MATRIX_K, MATRIX_N, B, (bfloat16)5); - matrix_fill(MATRIX_M, MATRIX_N, C, (float)1); - matrix_fill(MATRIX_M, MATRIX_N, D, (float)1); - - matrix_vnni(MATRIX_K, MATRIX_N, B, vnniB, vnniFactor); - - matrix_multiply_ref(A, B, D, MATRIX_M, MATRIX_N, MATRIX_K); - matrix_multiply(C, A, vnniB, q); - bool res = matrix_compare(MATRIX_M, MATRIX_N, C, D); - - matrix_multiply(C, A, B, q); - res = res && matrix_compare(MATRIX_M, MATRIX_N, C, D); - - std::cout << (res ? 
"passed" : "failed") << std::endl; - - free(A, q); - free(B, q); - free(vnniB, q); - free(C, q); - free(D, q); - - return !res; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch_impl.hpp deleted file mode 100644 index 1e665f618860f..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch_impl.hpp +++ /dev/null @@ -1,148 +0,0 @@ -//===---joint_matrix_prefetch_impl.hpp - DPC++ joint_matrix----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include - -#define TM 8 -#define TK 16 - -template class mult; - -template -void joint_matrix_gemm_vnni(sub_group sg, size_t sg_startx, size_t sg_starty, - size_t sg_size, T1 *A, T2 *B, T *C) { - auto pA = address_space_cast(A); - auto pB = address_space_cast(B); - auto pC = address_space_cast(C); - - joint_matrix sub_a; - joint_matrix sub_b; - joint_matrix sub_c; - joint_matrix_prefetch(sg, A + (sg_startx * TM) * K, K, - layout::row_major, - syclex::properties{syclex::prefetch_hint_L1}); - joint_matrix_prefetch(sg, B + sg_starty / sg_size * TN * vnniFactor, - N * vnniFactor, B_layout, - syclex::properties{syclex::prefetch_hint_L1}); - - if constexpr (C_layout == layout::col_major) - joint_matrix_prefetch( - sg, C + (sg_starty / sg_size * TN) * M + (sg_startx * TM), M, C_layout, - syclex::properties{syclex::prefetch_hint_L1}); - else - joint_matrix_prefetch( - sg, C + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, C_layout, - syclex::properties{syclex::prefetch_hint_L1}); - - joint_matrix_fill(sg, sub_c, 1); - for (int k = 0; k < K; k += TK) { - joint_matrix_load(sg, sub_a, pA + (sg_startx * TM) * K + k, K); - joint_matrix_load(sg, 
sub_b, - pB + k * N + sg_starty / sg_size * TN * vnniFactor, - N * vnniFactor); - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - } - - if constexpr (C_layout == layout::col_major) - joint_matrix_store(sg, sub_c, - pC + (sg_starty / sg_size * TN) * M + (sg_startx * TM), - M, C_layout); - else - joint_matrix_store(sg, sub_c, - pC + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, - C_layout); -} - -template -void matrix_multiply(T *C, T1 *A, T2 *B, queue q) { - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - - size_t sg_size = get_sg_size>(q); - q.submit([&](handler &cgh) { - cgh.parallel_for>( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix_gemm_vnni(sg, sg_startx, sg_starty, sg_size, - A, B, C); - }); // parallel for - }).wait(); -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - bool support_p = false; - // joint_matrix_prefetch is not supported on DG2 - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0 || combinations[i].nsize == 16) { - support_p = true; - break; - } - } - if (!support_p) { - std::cout << "Prefetch not supported on this device" << std::endl; - return 0; - } - static constexpr size_t M = TM * 2; - static constexpr size_t N = TN * 2; - static constexpr size_t K = TK * 2; - static constexpr unsigned int vnniFactor = 2; - bfloat16 *A = malloc_shared(M * K, q); - bfloat16 *B = malloc_shared(K * N, q); - bfloat16 *vnniB = malloc_shared(K * N, q); - float *C = malloc_shared(M * N, q); - float *D = malloc_shared(M * N, q); - float *Dt = malloc_shared(M * 
N, q); - - matrix_fill(M, K, A, [](int i, int j) { return 1.0f * (i + j); }); - matrix_fill(K, N, (bfloat16 *)B, - [](int i, int j) { return 2.0f * i + 3.0f * j; }); - matrix_fill(M, N, C, 1.0f); - matrix_fill(M, N, D, 1.0f); - matrix_fill(M, N, Dt, 1.0f); - - matrix_vnni(K, N, B, vnniB, vnniFactor); - - matrix_multiply_ref(A, B, D, M, N, K); - matrix_multiply(C, A, B, q); - - bool res = matrix_compare(M, N, C, D); - matrix_multiply(C, A, vnniB, q); - - res = res && matrix_compare(M, N, C, D); - - matrix_multiply_ref(A, B, Dt, M, N, K, true); - matrix_multiply(C, A, B, q); - - res = res && matrix_compare(M, N, C, Dt); - std::cout << (res ? "passed" : "failed") << std::endl; - return !res; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB_impl.hpp deleted file mode 100644 index 754d49c354d01..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB_impl.hpp +++ /dev/null @@ -1,151 +0,0 @@ -//==-----joint_matrix_rowmajorA_rowmajorB_impl.hpp - DPC++ joint_matrix----==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -template -void matrix_multiply(big_matrix &C, big_matrix &A, - big_matrix &B) { - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - buffer bufA((TA *)A.get_data(), range<2>(M, K)); - buffer bufB((TB *)B.get_data(), range<2>(K, N)); - buffer bufC((TC *)C.get_data(), range<2>(M, N)); - - queue q; - size_t sg_size = get_sg_size(q); - q.submit([&](handler &cgh) { - sycl::accessor accC{bufC, cgh, sycl::read_write}; - sycl::accessor accA{bufA, cgh, sycl::read_only}; - sycl::accessor accB{bufB, cgh, sycl::read_only}; - - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a; - joint_matrix sub_b; - joint_matrix sub_c; - - joint_matrix_load( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - for (int k = 0; k < K / TK; k += 1) { - joint_matrix_load( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TM) * K + k * TK, - K); - joint_matrix_load( - sg, sub_b, - accB.template get_multi_ptr() + - (k * TK) * (N) + sg_starty / sg_size * TN, - N); - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - } - joint_matrix_store( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - 
}); // parallel for - }).wait(); -} - -template -int gemm_row_major() { - static constexpr size_t MATRIX_M = TM * 2; - static constexpr size_t MATRIX_N = TN * 2; - static constexpr size_t MATRIX_K = TK * 2; - TA A[MATRIX_M][MATRIX_K]; - TB B[MATRIX_K][MATRIX_N]; - TC C[MATRIX_M][MATRIX_N]; - TC D[MATRIX_M][MATRIX_N]; - - matrix_fill(MATRIX_M, MATRIX_K, (TA *)A, - [](int i, int j) { return 1 * (i + j); }); - matrix_fill(MATRIX_K, MATRIX_N, (TB *)B, - [](int i, int j) { return 2 * i + 3 * j; }); - matrix_fill(MATRIX_M, MATRIX_N, (TC *)C, (TC)1); - matrix_fill(MATRIX_M, MATRIX_N, (TC *)D, (TC)1); - - big_matrix MC((TC *)&C); - big_matrix MD((TC *)&D); - big_matrix MA((TA *)&A); - big_matrix MB((TB *)&B); - matrix_multiply(MC, MA, MB); - matrix_multiply_ref((TA *)A, (TB *)B, (TC *)D, MATRIX_M, MATRIX_N, MATRIX_K); - - bool res = matrix_compare(MATRIX_M, MATRIX_N, (TC *)C, (TC *)D); - std::cout << TM << "x" << TN << "x" << TK << ": "; - std::cout << (res ? "passed" : "failed") << std::endl; - return !res; -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - int res = 0; - for (auto &combination : combinations) { - if (combination.nsize == 0 || - combination.nsize == 16) { // Intel AMX or architecture::intel_gpu_pvc - res += gemm_row_major<8, 16, 16, class bf16_8x16x16, bfloat16, bfloat16, - float>(); - res += gemm_row_major<8, 16, 32, class ss_8x16x32, int8_t, int8_t, - int32_t>(); - res += gemm_row_major<8, 16, 32, class us_8x16x32, uint8_t, int8_t, - int32_t>(); - res += gemm_row_major<8, 16, 32, class su_8x16x32, int8_t, uint8_t, - int32_t>(); - res += gemm_row_major<8, 16, 32, class uu_8x16x32, uint8_t, uint8_t, - int32_t>(); - - if (combination.nsize == 16) { // architecture::intel_gpu_pvc - res += gemm_row_major<16, 16, 16, class bf16_16x16x16, bfloat16, - bfloat16, float>(); - res += gemm_row_major<1, 64, 16, class bf16_1x64x16, bfloat16, bfloat16, - float>(); - res += gemm_row_major<32, 64, 16, class bf16_32x64x16, 
bfloat16, - bfloat16, float>(); - } - break; - } - - if (combination.nsize == 8) { // architecture::intel_gpu_dg2* - res += gemm_row_major<8, 8, 16, class bf16_8x8x16, bfloat16, bfloat16, - float>(); - res += - gemm_row_major<8, 8, 32, class ss_8x8x32, int8_t, int8_t, int32_t>(); - res += - gemm_row_major<8, 8, 32, class us_8x8x32, uint8_t, int8_t, int32_t>(); - res += - gemm_row_major<8, 8, 32, class su_8x8x32, int8_t, uint8_t, int32_t>(); - res += gemm_row_major<8, 8, 32, class uu_8x8x32, uint8_t, uint8_t, - int32_t>(); - break; - } - } - return res; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8_impl.hpp deleted file mode 100644 index 974a489002b47..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8_impl.hpp +++ /dev/null @@ -1,128 +0,0 @@ -//===---joint_matrix_ss_int8_impl.hpp - DPC++ joint_matrix-----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -template class mult; - -template -void matrix_multiply(big_matrix &C, big_matrix &A, - big_matrix &B) { - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC(C.get_data(), range<2>(M, N)); - - queue q; - size_t sg_size = get_sg_size>(q); - q.submit([&](handler &cgh) { - accessor accA{bufA, cgh}; - accessor accB{bufB, cgh}; - accessor accC{bufC, cgh}; - - cgh.parallel_for>( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a; - // For B, we assume B has been already VNNIed. 
- joint_matrix - sub_b; - joint_matrix sub_c; - - joint_matrix_fill(sg, sub_c, 0); - for (int k = 0; k < K / TK; k += 1) { - joint_matrix_load( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TM) * K + k * TK, - K); - joint_matrix_load( - sg, sub_b, - accB.template get_multi_ptr() + - (k * TK / VNNI) * (N * VNNI) + - sg_starty / sg_size * TN * VNNI, - N * VNNI); - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - } - joint_matrix_store( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - }); // parallel for - }).wait(); -} - -template -void test() { - static constexpr size_t MATRIX_M = TM * 2; - static constexpr size_t MATRIX_N = TN * 2; - static constexpr size_t MATRIX_K = TK * 2; - T A[MATRIX_M][MATRIX_K]; - T B[MATRIX_K / VNNI][MATRIX_N * VNNI]; - TResult C[MATRIX_M][MATRIX_N]; - TResult D[MATRIX_M][MATRIX_N]; - - matrix_fill(MATRIX_M, MATRIX_K, (T *)A, - [](int i, int j) { return i + 2 * j; }); - matrix_fill(MATRIX_K / VNNI, MATRIX_N * VNNI, (T *)B, - [](int i, int j) { return i + j; }); - matrix_fill(MATRIX_M, MATRIX_N, (TResult *)C, 0); - matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, 0); - - big_matrix MC((TResult *)&C); - big_matrix MD((TResult *)&D); - big_matrix MA((T *)&A); - big_matrix MB((T *)&B); - matrix_multiply( - MC, MA, MB); - matrix_multiply_ref((T *)A, (T *)B, (TResult *)D, - MATRIX_M, MATRIX_N, MATRIX_K / VNNI); - - assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)C, (TResult *)D)); -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test(); - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test(); - break; - } - } - return 0; -} diff --git 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8_impl.hpp deleted file mode 100644 index 17c9d47f61c36..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8_impl.hpp +++ /dev/null @@ -1,132 +0,0 @@ -//===---joint_matrix_su_int8_impl.hpp - DPC++ joint_matrix-----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -template class mult; - -template -void matrix_multiply(big_matrix &C, big_matrix &A, - big_matrix &B) { - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC(C.get_data(), range<2>(M, N)); - - queue q; - size_t sg_size = get_sg_size>(q); - q.submit([&](handler &cgh) { - accessor accA{bufA, cgh}; - accessor accB{bufB, cgh}; - accessor accC{bufC, cgh}; - - cgh.parallel_for>( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a; - // For B, we assume B has been already VNNIed. 
- joint_matrix - sub_b; - joint_matrix sub_c; - - joint_matrix_load( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - for (int k = 0; k < K / TK; k += 1) { - joint_matrix_load( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TM) * K + k * TK, - K); - joint_matrix_load( - sg, sub_b, - accB.template get_multi_ptr() + - (k * TK / VNNI) * (N * VNNI) + - sg_starty / sg_size * TN * VNNI, - N * VNNI); - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - } - joint_matrix_store( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - }); // parallel for - }).wait(); -} - -template -void test() { - static constexpr size_t MATRIX_M = TM * 2; - static constexpr size_t MATRIX_N = TN * 2; - static constexpr size_t MATRIX_K = TK * 2; - TS A[MATRIX_M][MATRIX_K]; - TU B[MATRIX_K / VNNI][MATRIX_N * VNNI]; - TResult C[MATRIX_M][MATRIX_N]; - TResult D[MATRIX_M][MATRIX_N]; - - matrix_fill(MATRIX_M, MATRIX_K, (TS *)A, - [](int i, int j) { return i + 2 * j; }); - matrix_fill(MATRIX_K / VNNI, MATRIX_N * VNNI, (TU *)B, - [](int i, int j) { return i + j; }); - matrix_fill(MATRIX_M, MATRIX_N, (TResult *)C, 1); - matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, 1); - - big_matrix MC((TResult *)&C); - big_matrix MD((TResult *)&D); - big_matrix MA((TS *)&A); - big_matrix MB((TU *)&B); - matrix_multiply(MC, MA, MB); - matrix_multiply_ref( - (TS *)A, (TU *)B, (TResult *)D, MATRIX_M, MATRIX_N, MATRIX_K / VNNI); - - assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)C, (TResult *)D)); -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test(); - break; - } - - if (combinations[i].nsize == 8) { // 
architecture::intel_gpu_dg2* - test(); - break; - } - } - return 0; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32_impl.hpp deleted file mode 100644 index 69991884c0710..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32_impl.hpp +++ /dev/null @@ -1,120 +0,0 @@ -//===---joint_matrix_tf32_impl.hpp - DPC++ joint_matrix--------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -constexpr size_t TM = 8; -constexpr size_t TK = 8; - -template -void matrix_multiply(big_matrix &C, - big_matrix &A, - big_matrix &B) { - size_t M = NUM_ROWS_C; - size_t N = NUM_COLS_C; - size_t K = NUM_COLS_A; - - assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B); - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC((float *)C.get_data(), range<2>(M, N)); - - queue q; - size_t sg_size = get_sg_size(q); - q.submit([&](handler &cgh) { - auto accC = bufC.get_access(cgh); - auto accA = bufA.get_access(cgh); - auto accB = bufB.get_access(cgh); - - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - // The matrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = 
global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_a; - joint_matrix - sub_b; - joint_matrix sub_c; - joint_matrix_load( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - for (int k = 0; k < K; k += TK) { - joint_matrix_load( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TM) * K + k, - K); - joint_matrix_load( - sg, sub_b, - accB.template get_multi_ptr() + - (k) * (N) + sg_starty / sg_size * TN, - N); - // If no rounding to tf32 function is called, joint_matrix_mad - // function will work on truncated floats. - joint_matrix_apply(sg, sub_a, - [=](float &x) { x = round_to_tf32(x); }); - joint_matrix_apply(sg, sub_b, - [=](float &x) { x = round_to_tf32(x); }); - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - } - - joint_matrix_store( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - }); // parallel for - }).wait(); -} - -int main() { - static constexpr size_t MATRIX_M = TM * 2; - static constexpr size_t MATRIX_N = TN * 2; - static constexpr size_t MATRIX_K = TK * 2; - float A[MATRIX_M][MATRIX_K]; - float B[MATRIX_K][MATRIX_N]; - float C[MATRIX_M][MATRIX_N]; - float D[MATRIX_M][MATRIX_N]; - - matrix_fill(MATRIX_M, MATRIX_K, (float *)A, - [](int i, int j) { return 1.0f * (i + j); }); - matrix_fill(MATRIX_K, MATRIX_N, (float *)B, - [](int i, int j) { return 2.0f * i + 3.0f * j; }); - matrix_fill(MATRIX_M, MATRIX_N, (float *)C, 1.0f); - matrix_fill(MATRIX_M, MATRIX_N, (float *)D, 1.0f); - - big_matrix MC((float *)&C); - big_matrix MD((float *)&D); - big_matrix MA((float *)&A); - big_matrix MB((float *)&B); - matrix_multiply(MC, MA, MB); - matrix_multiply_ref((float *)A, (float *)B, (float *)D, MATRIX_M, MATRIX_N, - MATRIX_K); - - bool res = matrix_compare(MATRIX_M, MATRIX_N, (float *)C, (float *)D); - std::cout << (res ? 
"passed" : "failed") << std::endl; - return !res; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC_impl.hpp deleted file mode 100644 index 278e5da5cf441..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC_impl.hpp +++ /dev/null @@ -1,151 +0,0 @@ -//===---joint_matrix_transposeC_impl.hpp - DPC++ joint_matrix--------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include - -template class LS; - -template -void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major, - queue q) { - size_t M = NUM_ROWS; - size_t N = NUM_COLS; - - static_assert((NUM_ROWS % TM) == 0); - static_assert((NUM_COLS % TN) == 0); - - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - size_t sg_size = get_sg_size>(q); - - q.submit([&](handler &cgh) { - cgh.parallel_for>( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - auto p_input = - address_space_cast(input); - - auto p_out_col_major = - address_space_cast(out_col_major); - auto p_out_row_major = - address_space_cast(out_row_major); - - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_matrix; - - auto row_major_offset = - (sg_startx * TM) * N + (sg_starty / sg_size * TN); - auto col_major_offset = - (sg_startx * TM) + (sg_starty / sg_size * TN) * M; - - 
joint_matrix_load(sg, sub_matrix, p_input + col_major_offset, M, - layout::col_major); - - joint_matrix_store(sg, sub_matrix, - p_out_col_major + row_major_offset, N, - layout::row_major); - - joint_matrix_store(sg, sub_matrix, - p_out_row_major + col_major_offset, M, - layout::col_major); - }); // parallel for - }).wait(); -} - -template void run_matrix_test() { - static constexpr size_t MATRIX_M = TM * 16; - static constexpr size_t MATRIX_N = TN * 16; - - queue q; - T *input = malloc_shared(MATRIX_M * MATRIX_N, q); - T *out_col_major = malloc_shared(MATRIX_M * MATRIX_N, q); - T *out_row_major = malloc_shared(MATRIX_M * MATRIX_N, q); - T *ref_col_major = malloc_shared(MATRIX_M * MATRIX_N, q); - - // input is column majot matrix so it is of NxM shape - matrix_rand(MATRIX_N, MATRIX_M, input, (T)5.0); - matrix_fill(MATRIX_M, MATRIX_N, out_col_major, (T)0); - matrix_fill(MATRIX_N, MATRIX_M, out_row_major, (T)0); - matrix_transpose(MATRIX_N, MATRIX_M, ref_col_major, input); - - matrix_load_and_store(input, out_col_major, - out_row_major, q); - - // we use exact comparison as no low precision calculation is used in this - // test - std::cout << "compare results for: " << TM << " x " << TN << " [TM x TN]" - << std::endl; - bool res = - matrix_compare(MATRIX_M, MATRIX_N, out_col_major, - ref_col_major) && - matrix_compare(MATRIX_N, MATRIX_M, out_row_major, input); - free(input, q); - free(out_col_major, q); - free(out_row_major, q); - free(ref_col_major, q); - assert(res); -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - run_matrix_test(); - run_matrix_test(); - run_matrix_test(); - run_matrix_test(); - run_matrix_test(); - run_matrix_test(); - run_matrix_test(); - run_matrix_test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - run_matrix_test(); - run_matrix_test(); - 
run_matrix_test(); - run_matrix_test(); - run_matrix_test(); - run_matrix_test(); - run_matrix_test(); - run_matrix_test(); - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - run_matrix_test(); - run_matrix_test(); - run_matrix_test(); - run_matrix_test(); - run_matrix_test(); - run_matrix_test(); - run_matrix_test(); - run_matrix_test(); - break; - } - } - return 0; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8_impl.hpp deleted file mode 100644 index baef5c195a1e6..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8_impl.hpp +++ /dev/null @@ -1,133 +0,0 @@ -//===---joint_matrix_us_int8_impl.hpp - DPC++ joint_matrix-----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -template class mult; - -template -void matrix_multiply(big_matrix &C, big_matrix &A, - big_matrix &B) { - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC(C.get_data(), range<2>(M, N)); - - queue q; - size_t sg_size = get_sg_size>(q); - q.submit([&](handler &cgh) { - accessor accA{bufA, cgh}; - accessor accB{bufB, cgh}; - accessor accC{bufC, cgh}; - - cgh.parallel_for>( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const 
auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a; - // For B, we assume B has been already VNNIed. - joint_matrix - sub_b; - joint_matrix sub_c; - - joint_matrix_load( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - for (int k = 0; k < K / TK; k += 1) { - joint_matrix_load( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TM) * K + k * TK, - K); - // Assuming B data is already in VNNI format. - joint_matrix_load( - sg, sub_b, - accB.template get_multi_ptr() + - (k * TK / VNNI) * (N * VNNI) + - sg_starty / sg_size * TN * VNNI, - N * VNNI); - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - } - joint_matrix_store( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - }); // parallel for - }).wait(); -} - -template -void test() { - static constexpr size_t MATRIX_M = TM * 2; - static constexpr size_t MATRIX_N = TN * 2; - static constexpr size_t MATRIX_K = TK * 2; - TU A[MATRIX_M][MATRIX_K]; - TS B[MATRIX_K / VNNI][MATRIX_N * VNNI]; - TResult C[MATRIX_M][MATRIX_N]; - TResult D[MATRIX_M][MATRIX_N]; - - matrix_fill(MATRIX_M, MATRIX_K, (TU *)A, - [](int i, int j) { return i + 2 * j; }); - matrix_fill(MATRIX_K / VNNI, MATRIX_N * VNNI, (TS *)B, - [](int i, int j) { return i + j; }); - matrix_fill(MATRIX_M, MATRIX_N, (TResult *)C, 1); - matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, 1); - - big_matrix MC((TResult *)&C); - big_matrix MD((TResult *)&D); - big_matrix MA((TU *)&A); - big_matrix MB((TS *)&B); - matrix_multiply(MC, MA, MB); - matrix_multiply_ref( - (TU *)A, (TS *)B, (TResult *)D, MATRIX_M, MATRIX_N, MATRIX_K / VNNI); - - assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)C, (TResult *)D)); -} - -int main() { - 
queue q; - std::vector combinations = - q.get_device() - .get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test(); - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test(); - break; - } - } - return 0; -} diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8_impl.hpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8_impl.hpp deleted file mode 100644 index 090b552848f0b..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8_impl.hpp +++ /dev/null @@ -1,133 +0,0 @@ -//===---joint_matrix_uu_int8_impl.hpp - DPC++ joint_matrix-----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -template class mult; - -template -void matrix_multiply(big_matrix &C, big_matrix &A, - big_matrix &B) { - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC(C.get_data(), range<2>(M, N)); - - queue q; - size_t sg_size = get_sg_size>(q); - q.submit([&](handler &cgh) { - accessor accA{bufA, cgh}; - accessor accB{bufB, cgh}; - accessor accC{bufC, cgh}; - - cgh.parallel_for>( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const 
auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a; - // For B, we assume B has been already VNNIed. - joint_matrix - sub_b; - joint_matrix sub_c; - - joint_matrix_load( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - for (int k = 0; k < K / TK; k += 1) { - joint_matrix_load( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TM) * K + k * TK, - K); - // Assuming B data is already in VNNI format. - joint_matrix_load( - sg, sub_b, - accB.template get_multi_ptr() + - (k * TK / VNNI) * (N * VNNI) + - sg_starty / sg_size * TN * VNNI, - N * VNNI); - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - } - joint_matrix_store( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - }); // parallel for - }).wait(); -} - -template -void test() { - static constexpr size_t MATRIX_M = TM * 2; - static constexpr size_t MATRIX_N = TN * 2; - static constexpr size_t MATRIX_K = TK * 2; - T A[MATRIX_M][MATRIX_K]; - T B[MATRIX_K / VNNI][MATRIX_N * VNNI]; - TResult C[MATRIX_M][MATRIX_N]; - TResult D[MATRIX_M][MATRIX_N]; - - matrix_fill(MATRIX_M, MATRIX_K, (T *)A, - [](int i, int j) { return i + 2 * j; }); - matrix_fill(MATRIX_K / VNNI, MATRIX_N * VNNI, (T *)B, - [](int i, int j) { return i + j; }); - matrix_fill(MATRIX_M, MATRIX_N, (TResult *)C, 1); - matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, 1); - - big_matrix MC((TResult *)&C); - big_matrix MD((TResult *)&D); - big_matrix MA((T *)&A); - big_matrix MB((T *)&B); - matrix_multiply( - MC, MA, MB); - matrix_multiply_ref((T *)A, (T *)B, (TResult *)D, - MATRIX_M, MATRIX_N, MATRIX_K / VNNI); - - assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)C, (TResult *)D)); -} - -int main() { - queue q; 
- std::vector combinations = - q.get_device() - .get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test(); - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test(); - break; - } - } - return 0; -} From 7d7c8e13adff094351e3c24c51efeacbfdd3f6e3 Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Mon, 12 Aug 2024 08:28:55 -0700 Subject: [PATCH 11/19] Fix SG32 tests Signed-off-by: Sidorov, Dmitry --- .../Matrix/SPVCooperativeMatrix/SG32/element_wise_abc.cpp | 6 +++--- .../SPVCooperativeMatrix/SG32/element_wise_all_ops.cpp | 6 +++--- .../SPVCooperativeMatrix/SG32/element_wise_all_ops_half.cpp | 6 +++--- .../SPVCooperativeMatrix/SG32/element_wise_all_ops_int8.cpp | 6 +++--- .../SG32/element_wise_all_ops_int8_packed.cpp | 6 +++--- .../SPVCooperativeMatrix/SG32/element_wise_all_ops_tf32.cpp | 6 +++--- .../SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp | 6 +++--- .../Matrix/SPVCooperativeMatrix/SG32/element_wise_ops.cpp | 6 +++--- .../SPVCooperativeMatrix/SG32/get_coord_float_matC.cpp | 6 +++--- .../SPVCooperativeMatrix/SG32/get_coord_int8_matA.cpp | 6 +++--- .../SPVCooperativeMatrix/SG32/get_coord_int8_matB.cpp | 6 +++--- .../SPVCooperativeMatrix/SG32/joint_matrix_all_sizes.cpp | 6 +++--- .../SG32/joint_matrix_annotated_ptr.cpp | 6 +++--- .../SPVCooperativeMatrix/SG32/joint_matrix_apply_bf16.cpp | 6 +++--- .../SG32/joint_matrix_apply_two_matrices.cpp | 4 ++-- .../SG32/joint_matrix_bf16_fill_k_cache.cpp | 6 +++--- .../SG32/joint_matrix_bf16_fill_k_cache_init.cpp | 6 +++--- .../SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp | 4 ++-- .../SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp | 4 ++-- .../SPVCooperativeMatrix/SG32/joint_matrix_bfloat16.cpp | 6 +++--- .../SG32/joint_matrix_bfloat16_array.cpp | 6 +++--- .../SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp 
| 6 +++--- .../SG32/joint_matrix_bfloat16_packedB.cpp | 6 +++--- .../SG32/joint_matrix_colA_rowB_colC.cpp | 6 +++--- .../SPVCooperativeMatrix/SG32/joint_matrix_down_convert.cpp | 6 +++--- .../Matrix/SPVCooperativeMatrix/SG32/joint_matrix_half.cpp | 6 +++--- .../SG32/joint_matrix_int8_colmajorA_colmajorB.cpp | 6 +++--- .../SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp | 6 +++--- .../SPVCooperativeMatrix/SG32/joint_matrix_out_bounds.cpp | 6 +++--- .../SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp | 6 +++--- .../SG32/joint_matrix_rowmajorA_rowmajorB.cpp | 6 +++--- .../SPVCooperativeMatrix/SG32/joint_matrix_ss_int8.cpp | 6 +++--- .../SPVCooperativeMatrix/SG32/joint_matrix_su_int8.cpp | 6 +++--- .../Matrix/SPVCooperativeMatrix/SG32/joint_matrix_tf32.cpp | 6 +++--- .../SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp | 6 +++--- .../SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp | 6 +++--- .../SPVCooperativeMatrix/SG32/joint_matrix_us_int8.cpp | 6 +++--- .../SPVCooperativeMatrix/SG32/joint_matrix_uu_int8.cpp | 6 +++--- 38 files changed, 111 insertions(+), 111 deletions(-) diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_abc.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_abc.cpp index 7a471ae628584..4c7cac6eae8ca 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_abc.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_abc.cpp @@ -10,11 +10,11 @@ // REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../element_wise_abc_impl.hpp" +#include "../../element_wise_abc_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops.cpp index 
8192567a039f3..414ff4e8d8b51 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops.cpp @@ -10,11 +10,11 @@ // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../element_wise_all_ops_impl.hpp" +#include "../../element_wise_all_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_half.cpp index e65396893c54a..3c333131c8d8b 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_half.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_half.cpp @@ -11,11 +11,11 @@ // REQUIRES: aspect-ext_intel_matrix, gpu // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../element_wise_all_ops_half_impl.hpp" +#include "../../element_wise_all_ops_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8.cpp index 25e5a628eb894..0a253724c8649 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8.cpp @@ -10,11 +10,11 @@ // REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include 
"../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../element_wise_all_ops_int8_impl.hpp" +#include "../../element_wise_all_ops_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8_packed.cpp index 2f199d13355b5..ca700a18ee344 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8_packed.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8_packed.cpp @@ -10,13 +10,13 @@ // REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // This test stores the matrix B that is VNNIed (packed). -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../element_wise_all_ops_int8_packed_impl.hpp" +#include "../../element_wise_all_ops_int8_packed_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_tf32.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_tf32.cpp index e42fc1d04fe8e..d841323fc3260 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_tf32.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_tf32.cpp @@ -8,12 +8,12 @@ // REQUIRES: gpu, matrix-tf32 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 constexpr size_t TN = 16; -#include "../element_wise_all_ops_tf32_impl.hpp" +#include "../../element_wise_all_ops_tf32_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp index 
66ce7cf96e40a..2d7af42e6b207 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp @@ -8,11 +8,11 @@ // REQUIRES: gpu, matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../element_wise_all_sizes_impl.hpp" +#include "../../element_wise_all_sizes_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_ops.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_ops.cpp index c18c37858646b..1be795aac5293 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_ops.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_ops.cpp @@ -10,11 +10,11 @@ // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../element_wise_ops_impl.hpp" +#include "../../element_wise_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_float_matC.cpp index f2d82933d4012..ceeb101401684 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_float_matC.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_float_matC.cpp @@ -10,11 +10,11 @@ // REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include 
"../get_coord_float_matC_impl.hpp" +#include "../../get_coord_float_matC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matA.cpp index a5436b255e513..910670a7423b2 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matA.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matA.cpp @@ -10,11 +10,11 @@ // REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../get_coord_int8_matA_impl.hpp" +#include "../../get_coord_int8_matA_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matB.cpp index ed6b730faf129..9e881a6e98977 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matB.cpp @@ -10,12 +10,12 @@ // REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // XFAIL: cpu -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../get_coord_int8_matB_impl.hpp" +#include "../../get_coord_int8_matB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_all_sizes.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_all_sizes.cpp index fe5b0c57b0c25..a6021a78db2de 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_all_sizes.cpp @@ -10,11 +10,11 @@ // REQUIRES: gpu, 
aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../joint_matrix_all_sizes_impl.hpp" +#include "../../joint_matrix_all_sizes_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_annotated_ptr.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_annotated_ptr.cpp index a3e450846f45b..a2ee153ff1895 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_annotated_ptr.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_annotated_ptr.cpp @@ -8,15 +8,15 @@ // REQUIRES: gpu, matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // Currently row major B fails when annotated_ptr is used // XFAIL: gpu -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 constexpr size_t TN = 16; -#include "../joint_matrix_annotated_ptr_impl.hpp" +#include "../../joint_matrix_annotated_ptr_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_bf16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_bf16.cpp index 1f5ed4df8d3a8..a2bd31c83a0d4 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_bf16.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_bf16.cpp @@ -10,11 +10,11 @@ // REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../joint_matrix_apply_bf16_impl.hpp" +#include "../../joint_matrix_apply_bf16_impl.hpp" diff --git 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp index d4caa86132d85..4e714aa3c1a6a 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp @@ -13,8 +13,8 @@ // RUN: %{build} -ffp-model=precise -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../joint_matrix_apply_two_matrices_impl.hpp" +#include "../../joint_matrix_apply_two_matrices_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache.cpp index 0c99f34aa5d82..64a96e16ece9a 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache.cpp @@ -10,13 +10,13 @@ // REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out -ffp-model=precise +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -ffp-model=precise // RUN: %{run} %t.out // -ffp-model=precise is added to not depend on compiler defaults. 
-#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../joint_matrix_bf16_fill_k_cache_impl.hpp" +#include "../../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp index 9b4ca122d8f59..bd3c1367c3ea1 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp @@ -10,13 +10,13 @@ // REQUIRES: gpu, aspect-ext_intel_matrix, gpu // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out -DINIT_LIST -ffp-model=precise +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -DINIT_LIST -ffp-model=precise // RUN: %{run} %t.out // -ffp-model=precise is added to not depend on compiler defaults. -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../joint_matrix_bf16_fill_k_cache_impl.hpp" +#include "../../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp index 53230a2d1d40e..9e2314a6f8f39 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp @@ -17,8 +17,8 @@ // since IGC doesn't support some variants of IR for Joint Matrix currently // -ffp-model=precise is added to not depend on compiler defaults. 
-#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../joint_matrix_bf16_fill_k_cache_impl.hpp" +#include "../../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp index 6c46816878b15..6079d71c6d0c1 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp @@ -17,8 +17,8 @@ // since IGC doesn't support some variants of IR for Joint Matrix currently // -ffp-model=precise is added to not depend on compiler defaults. -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../joint_matrix_bf16_fill_k_cache_impl.hpp" +#include "../../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16.cpp index fbdbe6df4ead4..279dae53dd8c3 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16.cpp @@ -10,11 +10,11 @@ // REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../joint_matrix_bfloat16_impl.hpp" +#include "../../joint_matrix_bfloat16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_array.cpp index abd4de12d8e85..4446e68cb82cc 100644 --- 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_array.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_array.cpp @@ -10,11 +10,11 @@ // REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../joint_matrix_bfloat16_array_impl.hpp" +#include "../../joint_matrix_bfloat16_array_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp index 0cf5b40563c0e..087aca09d3167 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -8,7 +8,7 @@ // REQUIRES: gpu, matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // This tests support of col major layout for matrix B which does transpose and @@ -16,7 +16,7 @@ // XFAIL: gpu -#include "../common.hpp" +#include "../../common.hpp" #include #include #include @@ -28,4 +28,4 @@ using bfloat16 = sycl::ext::oneapi::bfloat16; #define SG_SZ 32 constexpr size_t TN = 16; -#include "../joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp" +#include "../../joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp index 555d779ae1c10..83cc70e2a7d87 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp +++ 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp @@ -8,12 +8,12 @@ // REQUIRES: gpu, matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // XFAIL: gpu -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../joint_matrix_bfloat16_packedB_impl.hpp" +#include "../../joint_matrix_bfloat16_packedB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_colA_rowB_colC.cpp index e1f7cc9c24cb6..dc581e77ad73a 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_colA_rowB_colC.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_colA_rowB_colC.cpp @@ -8,14 +8,14 @@ // REQUIRES: gpu, matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // XFAIL:* -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 constexpr size_t TN = 16; -#include "../joint_matrix_colA_rowB_colC_impl.hpp" +#include "../../joint_matrix_colA_rowB_colC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_down_convert.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_down_convert.cpp index f173f12b604c9..a473d4457ddff 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_down_convert.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_down_convert.cpp @@ -8,11 +8,11 @@ // REQUIRES: gpu, matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../joint_matrix_down_convert_impl.hpp" 
+#include "../../joint_matrix_down_convert_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_half.cpp index 07f83c5b24315..7430e522a46af 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_half.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_half.cpp @@ -12,11 +12,11 @@ // REQUIRES: matrix-fp16 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../joint_matrix_half_impl.hpp" +#include "../../joint_matrix_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp index b978573afdcae..594b0c91ad7aa 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp @@ -8,7 +8,7 @@ // REQUIRES: gpu, matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // This tests support of col major layout for matrix B which does transpose and @@ -16,7 +16,7 @@ // XFAIL: gpu -#include "../common.hpp" +#include "../../common.hpp" using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; @@ -24,4 +24,4 @@ using namespace sycl::ext::oneapi::experimental::matrix; #define SG_SZ 32 constexpr size_t TN = 16; -#include "../joint_matrix_int8_colmajorA_colmajorB_impl.hpp" +#include "../../joint_matrix_int8_colmajorA_colmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp index 8512f84df4a57..9735478c21a82 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp @@ -10,13 +10,13 @@ // REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // XFAIL: gpu -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../joint_matrix_int8_rowmajorA_rowmajorB_impl.hpp" +#include "../../joint_matrix_int8_rowmajorA_rowmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_out_bounds.cpp index fb571b4dc12c3..48a0215218715 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_out_bounds.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_out_bounds.cpp @@ -8,15 +8,15 @@ // REQUIRES: gpu, matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // XFAIL:* -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 constexpr size_t TN = 16; constexpr size_t MATRIX_K = 1024 + 24; -#include "../joint_matrix_out_bounds_impl.hpp" +#include "../../joint_matrix_out_bounds_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp index 6f328ec11fffd..707adc061bcfb 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp @@ -7,7 +7,7 @@ 
//===----------------------------------------------------------------------===// // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // REQUIRES: gpu, aspect-ext_intel_matrix -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // XFAIL: gpu @@ -15,8 +15,8 @@ // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 constexpr size_t TN = 16; -#include "../joint_matrix_prefetch_impl.hpp" +#include "../../joint_matrix_prefetch_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp index ada0c37286b96..2457eaac47b90 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp @@ -11,17 +11,17 @@ // VNNI transform and sub-group size 32 are not supported yet on DG2 by IGC // UNSUPPORTED: gpu-intel-dg2 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // Sub-group size 32 support for this test is not currently available in IGC // XFAIL: gpu -#include "../common.hpp" +#include "../../common.hpp" using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; #define SG_SZ 32 -#include "../joint_matrix_rowmajorA_rowmajorB_impl.hpp" +#include "../../joint_matrix_rowmajorA_rowmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_ss_int8.cpp index 97cd8bb817a6b..98c69c1655a17 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_ss_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_ss_int8.cpp @@ -10,11 +10,11 @@ 
// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../joint_matrix_ss_int8_impl.hpp" +#include "../../joint_matrix_ss_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_su_int8.cpp index 3feb8272fdfa1..913034564e1bf 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_su_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_su_int8.cpp @@ -10,11 +10,11 @@ // REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../joint_matrix_su_int8_impl.hpp" +#include "../../joint_matrix_su_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_tf32.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_tf32.cpp index c57542890d1ec..965a80fe98398 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_tf32.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_tf32.cpp @@ -8,10 +8,10 @@ // REQUIRES: gpu, matrix-tf32 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" +#include "../../common.hpp" using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; @@ -19,4 +19,4 @@ using namespace sycl::ext::oneapi::experimental::matrix; #define SG_SZ 32 constexpr size_t TN = 16; -#include "../joint_matrix_tf32_impl.hpp" +#include 
"../../joint_matrix_tf32_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp index 4a1d8fd618775..bf23b7fb8f504 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp @@ -10,11 +10,11 @@ // REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../joint_matrix_transposeC_impl.hpp" +#include "../../joint_matrix_transposeC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp index 74a5b6fefa1a9..e54164775eb38 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp @@ -8,15 +8,15 @@ // REQUIRES: gpu, matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // XFAIL:* -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 constexpr size_t TN = 16; static constexpr size_t MATRIX_K = 1024 + 14; -#include "../joint_matrix_out_bounds_impl.hpp" +#include "../../joint_matrix_out_bounds_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_us_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_us_int8.cpp index 9413b5e20ef82..a59a3ca04843d 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_us_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_us_int8.cpp @@ -10,11 
+10,11 @@ // REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../joint_matrix_us_int8_impl.hpp" +#include "../../joint_matrix_us_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_uu_int8.cpp index 501264f2ce1f1..d7f73d765ac5b 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_uu_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_uu_int8.cpp @@ -10,11 +10,11 @@ // REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -#include "../common.hpp" +#include "../../common.hpp" #define SG_SZ 32 -#include "../joint_matrix_uu_int8_impl.hpp" +#include "../../joint_matrix_uu_int8_impl.hpp" From 80368d0a30da5758f953f349d52a04b9942e82f2 Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Mon, 12 Aug 2024 08:30:58 -0700 Subject: [PATCH 12/19] remove old folder Signed-off-by: Sidorov, Dmitry --- .../SPVCooperativeMatrix/element_wise_abc.cpp | 14 -------------- .../SPVCooperativeMatrix/element_wise_ops.cpp | 14 -------------- .../joint_matrix_bfloat16.cpp | 14 -------------- .../joint_matrix_bfloat16_array.cpp | 14 -------------- .../SPVCooperativeMatrix/joint_matrix_half.cpp | 17 ----------------- .../joint_matrix_prefetch.cpp | 15 --------------- .../joint_matrix_ss_int8.cpp | 14 -------------- .../joint_matrix_su_int8.cpp | 14 -------------- .../SPVCooperativeMatrix/joint_matrix_tf32.cpp | 18 ------------------ .../joint_matrix_us_int8.cpp | 14 -------------- .../joint_matrix_uu_int8.cpp | 14 -------------- 11 files changed, 162 deletions(-) 
delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/element_wise_abc.cpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/element_wise_ops.cpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_half.cpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/element_wise_abc.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/element_wise_abc.cpp deleted file mode 100644 index c8bc57c62912b..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/element_wise_abc.cpp +++ /dev/null @@ -1,14 +0,0 @@ -//==----------- element_wise_abc.cpp - DPC++ joint_matrix------------- ----==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: gpu, gpu, aspect-ext_intel_matrix - -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -// RUN: %{run} %t.out - -#include "../common.hpp" -#include "../element_wise_abc_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/element_wise_ops.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/element_wise_ops.cpp deleted file mode 100644 index ae9e21855bcfb..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/element_wise_ops.cpp +++ /dev/null @@ -1,14 +0,0 @@ -//==----------- element_wise_ops.cpp - DPC++ joint_matrix------------- ----==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: gpu, gpu, aspect-ext_intel_matrix - -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -// RUN: %{run} %t.out - -#include "../common.hpp" -#include "../element_wise_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp deleted file mode 100644 index 2dd6b2f9e7ebf..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp +++ /dev/null @@ -1,14 +0,0 @@ -//==-------- joint_matrix_bfloat16.cpp - DPC++ joint_matrix----------- ----==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: gpu, gpu, aspect-ext_intel_matrix - -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -// RUN: %{run} %t.out - -#include "../common.hpp" -#include "../joint_matrix_bfloat16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp deleted file mode 100644 index a0a67f55fa4ab..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp +++ /dev/null @@ -1,14 +0,0 @@ -//==-------- joint_matrix_bfloat16_array.cpp - DPC++ joint_matrix----------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: gpu, gpu, aspect-ext_intel_matrix - -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -// RUN: %{run} %t.out - -#include "../common.hpp" -#include "../joint_matrix_bfloat16_array_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_half.cpp deleted file mode 100644 index 378556e190a7a..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_half.cpp +++ /dev/null @@ -1,17 +0,0 @@ -//==-------- joint_matrix_half.cpp - DPC++ joint_matrix------------ ----==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-fp16 -// REQUIRES: aspect-ext_intel_matrix -// REQUIRES: matrix-fp16 -// REQUIRES: gpu - -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -// RUN: %{run} %t.out - -#include "../common.hpp" -#include "../joint_matrix_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp deleted file mode 100644 index 918c3df7db407..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp +++ /dev/null @@ -1,15 +0,0 @@ -//==-------- joint_matrix_prefetch.cpp - DPC++ joint_matrix----------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: gpu, gpu, aspect-ext_intel_matrix -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -// RUN: %{run} %t.out - -#include "../common.hpp" - -constexpr size_t TN = 16; -#include "../joint_matrix_prefetch_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp deleted file mode 100644 index 0b6fa4ac88fe3..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp +++ /dev/null @@ -1,14 +0,0 @@ -//==-------- joint_matrix_ss_int8.cpp - DPC++ joint_matrix------------ ----==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: gpu, gpu, aspect-ext_intel_matrix - -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -// RUN: %{run} %t.out - -#include "../common.hpp" -#include "../joint_matrix_ss_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp deleted file mode 100644 index b9f746f4ea8fc..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp +++ /dev/null @@ -1,14 +0,0 @@ -//==-------- joint_matrix_su_int8.cpp - DPC++ joint_matrix------------ ----==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: gpu, gpu, aspect-ext_intel_matrix - -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -// RUN: %{run} %t.out - -#include "../common.hpp" -#include "../joint_matrix_su_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp deleted file mode 100644 index c006e6b401b4a..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//==---------------- joint_matrix_tf32.cpp - DPC++ joint_matrix------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: gpu, gpu, matrix-tf32 -// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 - -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -// RUN: %{run} %t.out - -#include "../common.hpp" - -constexpr size_t TN = 16; - -#include "../joint_matrix_tf32_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp deleted file mode 100644 index 38d01c67df783..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp +++ /dev/null @@ -1,14 +0,0 @@ -//==-------- joint_matrix_us_int8.cpp - DPC++ joint_matrix------------ ----==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: gpu, gpu, aspect-ext_intel_matrix - -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -// RUN: %{run} %t.out - -#include "../common.hpp" -#include "../joint_matrix_us_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp deleted file mode 100644 index ac01a98568e9a..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp +++ /dev/null @@ -1,14 +0,0 @@ -//==-------- joint_matrix_uu_int8.cpp - DPC++ joint_matrix------------ ----==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: gpu, gpu, aspect-ext_intel_matrix - -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -// RUN: %{run} %t.out - -#include "../common.hpp" -#include "../joint_matrix_uu_int8_impl.hpp" From 65c1c4cd746ae77737f2f32d4b313eb09d8eb6fd Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Tue, 13 Aug 2024 04:46:03 -0700 Subject: [PATCH 13/19] fix tests Signed-off-by: Sidorov, Dmitry --- .../SG32/element_wise_all_ops_half.cpp | 2 +- .../SG32/joint_matrix_apply_two_matrices.cpp | 2 +- .../joint_matrix_bf16_fill_k_cache_init.cpp | 2 +- .../joint_matrix_bf16_fill_k_cache_unroll.cpp | 2 +- ...t_matrix_bf16_fill_k_cache_unroll_init.cpp | 4 +-- .../element_wise_all_ops.cpp | 3 -- .../element_wise_all_ops_1d.cpp | 5 +-- .../element_wise_all_ops_1d_cont.cpp | 5 +-- .../element_wise_all_ops_scalar.cpp | 5 +-- .../element_wise_all_sizes.cpp | 3 -- .../joint_matrix_apply_two_matrices.cpp | 2 +- .../joint_matrix_bf16_fill_k_cache_OOB.cpp | 4 +-- .../joint_matrix_bf16_fill_k_cache_init.cpp | 2 +- .../joint_matrix_bf16_fill_k_cache_unroll.cpp | 2 +- ...trix_opt_kernel_feature_unsupported_hw.cpp | 36 ------------------- 15 files changed, 14 insertions(+), 65 deletions(-) delete mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature_unsupported_hw.cpp diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_half.cpp index 3c333131c8d8b..3d286108fea6b 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_half.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_half.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on 
DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: gpu, aspect-fp16 +// REQUIRES: aspect-fp16 // REQUIRES: aspect-ext_intel_matrix, gpu // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp index 4e714aa3c1a6a..1c1d134f9c30d 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp @@ -10,7 +10,7 @@ // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// RUN: %{build} -ffp-model=precise -o %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -ffp-model=precise -o %t.out // RUN: %{run} %t.out #include "../../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp index bd3c1367c3ea1..6cc104d4a3ca7 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: gpu, aspect-ext_intel_matrix, gpu +// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -DINIT_LIST -ffp-model=precise diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp index 9e2314a6f8f39..005ae4581794e 100644 --- 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp @@ -10,7 +10,7 @@ // REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -mllvm -inline-threshold=5000 -ffp-model=precise -o %t.out -DMANUAL_UNROLL +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -mllvm -inline-threshold=5000 -ffp-model=precise -o %t.out -DMANUAL_UNROLL // RUN: %{run} %t.out // -mllvm -inline-threshold added as a workaround, diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp index 6079d71c6d0c1..5421d081ec86a 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp @@ -7,10 +7,10 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: gpu, aspect-ext_intel_matrix, gpu +// REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -mllvm -inline-threshold=5000 -ffp-model=precise -o %t_gpu.out -DINIT_LIST -DMANUAL_UNROLL +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -mllvm -inline-threshold=5000 -ffp-model=precise -o %t_gpu.out -DINIT_LIST -DMANUAL_UNROLL // RUN: %{run} %t_gpu.out // -mllvm -inline-threshold added as a workaround, diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops.cpp index 0fe6ccae0cc83..d2dbdeffd650d 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops.cpp +++ 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops.cpp @@ -10,8 +10,5 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -// TODO: Reenable, see https://github.com/intel/llvm/issues/14598 -// UNSUPPORTED: windows, linux - #include "../common.hpp" #include "../element_wise_all_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp index 72d7bf86884cf..1749aa0f237f4 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp @@ -5,13 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix, gpu +// REQUIRES: gpu, aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out -// TODO: Reenable, see https://github.com/intel/llvm/issues/14598 -// UNSUPPORTED: windows, linux - #include "../common.hpp" #include "../element_wise_all_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d_cont.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d_cont.cpp index f96a0ab5305aa..8f1233a22f340 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d_cont.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d_cont.cpp @@ -5,13 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix, gpu +// REQUIRES: gpu, aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: env IGC_JointMatrixLoadStoreOpt=2 %{run} %t.out -// TODO: Reenable, see 
https://github.com/intel/llvm/issues/14598 -// UNSUPPORTED: windows, linux - #include "../common.hpp" #include "../element_wise_all_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_scalar.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_scalar.cpp index e599ab9015a09..220c47dc57baf 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_scalar.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_scalar.cpp @@ -5,13 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix, gpu +// REQUIRES: gpu, aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out -// TODO: Reenable, see https://github.com/intel/llvm/issues/14598 -// UNSUPPORTED: windows, linux - #include "../common.hpp" #include "../element_wise_all_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp index 301b8b30e69c5..e1f10456ec29a 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp @@ -8,9 +8,6 @@ // REQUIRES: gpu, aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// TODO: Reenable, see https://github.com/intel/llvm/issues/14598 -// UNSUPPORTED: windows, linux - // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp index d41424f1fa6f0..e404115ca3966 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp +++ 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // REQUIRES: gpu, aspect-ext_intel_matrix -// RUN: %{build} -ffp-model=precise -o %t.out +// RUN: %{build} -ffp-model=precise -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out #include "../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_OOB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_OOB.cpp index 86cb8f6bf4704..03b14bbc805dd 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_OOB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_OOB.cpp @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix, gpu +// REQUIRES: gpu, aspect-ext_intel_matrix // UNSUPPORTED: gpu-intel-dg2 -// RUN: %{build} -o %t_gpu.out -ffp-model=precise -DOOB +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t_gpu.out -ffp-model=precise -DOOB // RUN: %{run} %t_gpu.out // XFAIL: gpu diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_init.cpp index e116e885d5b2c..05cf7235ffd18 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_init.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_init.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix, gpu +// REQUIRES: gpu, aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -DINIT_LIST -ffp-model=precise // 
RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll.cpp index 752d0303b2dfa..103eb3c65fc6c 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // REQUIRES: gpu, aspect-ext_intel_matrix -// RUN: %{build} -mllvm -inline-threshold=2000 -ffp-model=precise -o %t.out -DMANUAL_UNROLL +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -mllvm -inline-threshold=2000 -ffp-model=precise -o %t.out -DMANUAL_UNROLL // RUN: %{run} %t.out // -mllvm -inline-threshold=2000 added as a workaround, diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature_unsupported_hw.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature_unsupported_hw.cpp deleted file mode 100644 index c8078e08f9151..0000000000000 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature_unsupported_hw.cpp +++ /dev/null @@ -1,36 +0,0 @@ -//===---joint_matrix_opt_kernel_feature_unsupported_hw_impl.cpp------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// REQUIRES: gpu, gpu-intel-gen12, gpu - -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -// RUN: %{run} %t.out - -// Test checks that exception will be thrown in case object of joint_matrix type -// is used on unsupported HW, in this case, on Gen12. 
- -#include "../common.hpp" - -int main() { - sycl::queue q; - - try { - q.submit([&](sycl::handler &cgh) { - cgh.single_task([]() { - joint_matrix - m; // matrix type and sizes do not matter - }); - }); - } catch (const sycl::exception &e) { - assert((e.code() == sycl::errc::kernel_not_supported) && - (std::string(e.what()) == - std::string("no matrix hardware on the target device, joint_matrix " - "is not supported"))); - } - return 0; -} From 39ed4176661b7c35d5a985e425e926980ecd9ce1 Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Tue, 13 Aug 2024 05:11:16 -0700 Subject: [PATCH 14/19] xfail cpu Signed-off-by: Sidorov, Dmitry --- .../Matrix/SPVCooperativeMatrix/SG32/element_wise_abc.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops.cpp | 2 ++ .../SPVCooperativeMatrix/SG32/element_wise_all_ops_half.cpp | 2 ++ .../SPVCooperativeMatrix/SG32/element_wise_all_ops_int8.cpp | 2 ++ .../SG32/element_wise_all_ops_int8_packed.cpp | 2 ++ .../SPVCooperativeMatrix/SG32/element_wise_all_ops_tf32.cpp | 2 ++ .../SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/SG32/element_wise_ops.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/SG32/get_coord_float_matC.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matA.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matB.cpp | 2 ++ .../SPVCooperativeMatrix/SG32/joint_matrix_all_sizes.cpp | 2 ++ .../SPVCooperativeMatrix/SG32/joint_matrix_annotated_ptr.cpp | 2 ++ .../SPVCooperativeMatrix/SG32/joint_matrix_apply_bf16.cpp | 2 ++ .../SG32/joint_matrix_apply_two_matrices.cpp | 2 ++ .../SG32/joint_matrix_bf16_fill_k_cache.cpp | 2 ++ .../SG32/joint_matrix_bf16_fill_k_cache_init.cpp | 2 ++ .../SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp | 2 ++ .../SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp | 2 ++ .../SPVCooperativeMatrix/SG32/joint_matrix_bfloat16.cpp | 2 ++ .../SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_array.cpp | 2 ++ 
.../SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp | 2 ++ .../SG32/joint_matrix_bfloat16_packedB.cpp | 2 ++ .../SPVCooperativeMatrix/SG32/joint_matrix_colA_rowB_colC.cpp | 2 ++ .../SPVCooperativeMatrix/SG32/joint_matrix_down_convert.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/SG32/joint_matrix_half.cpp | 2 ++ .../SG32/joint_matrix_int8_colmajorA_colmajorB.cpp | 2 ++ .../SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp | 2 ++ .../SPVCooperativeMatrix/SG32/joint_matrix_out_bounds.cpp | 2 ++ .../SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp | 2 ++ .../SG32/joint_matrix_rowmajorA_rowmajorB.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/SG32/joint_matrix_ss_int8.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/SG32/joint_matrix_su_int8.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/SG32/joint_matrix_tf32.cpp | 2 ++ .../SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp | 2 ++ .../SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/SG32/joint_matrix_us_int8.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/SG32/joint_matrix_uu_int8.cpp | 2 ++ .../test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc.cpp | 4 +++- .../Matrix/SPVCooperativeMatrix/element_wise_all_ops.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp | 2 ++ .../SPVCooperativeMatrix/element_wise_all_ops_1d_cont.cpp | 4 +++- .../Matrix/SPVCooperativeMatrix/element_wise_all_ops_half.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8.cpp | 2 ++ .../SPVCooperativeMatrix/element_wise_all_ops_int8_packed.cpp | 2 ++ .../SPVCooperativeMatrix/element_wise_all_ops_scalar.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp | 2 ++ .../test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/get_coord_float_matC.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/get_coord_int8_matA.cpp | 2 ++ 
.../Matrix/SPVCooperativeMatrix/get_coord_int8_matB.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes.cpp | 2 ++ .../SPVCooperativeMatrix/joint_matrix_annotated_ptr.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16.cpp | 2 ++ .../SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp | 2 ++ .../SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp | 2 ++ .../joint_matrix_bf16_fill_k_cache_OOB.cpp | 2 ++ .../joint_matrix_bf16_fill_k_cache_init.cpp | 2 ++ .../joint_matrix_bf16_fill_k_cache_unroll.cpp | 2 ++ .../joint_matrix_bf16_fill_k_cache_unroll_init.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp | 2 ++ .../SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp | 2 ++ .../joint_matrix_bfloat16_colmajorA_colmajorB.cpp | 2 ++ .../SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp | 2 ++ .../SPVCooperativeMatrix/joint_matrix_colA_rowB_colC.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/joint_matrix_down_convert.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/joint_matrix_half.cpp | 2 ++ .../joint_matrix_int8_colmajorA_colmajorB.cpp | 2 ++ .../joint_matrix_int8_rowmajorA_rowmajorB.cpp | 2 ++ .../SPVCooperativeMatrix/joint_matrix_opt_kernel_feature.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp | 2 ++ .../SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/joint_matrix_transposeC.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/joint_matrix_unaligned_k.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp | 2 ++ .../Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp | 2 ++ 81 files changed, 164 insertions(+), 2 deletions(-) diff --git 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_abc.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_abc.cpp index 4c7cac6eae8ca..cce4a1b89f9fb 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_abc.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_abc.cpp @@ -13,6 +13,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../../common.hpp" #define SG_SZ 32 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops.cpp index 414ff4e8d8b51..91f908ed1c335 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops.cpp @@ -13,6 +13,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../../common.hpp" #define SG_SZ 32 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_half.cpp index 3d286108fea6b..c90aa1b824a6d 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_half.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_half.cpp @@ -14,6 +14,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../../common.hpp" #define SG_SZ 32 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8.cpp index 0a253724c8649..ccab3b8c200ef 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8.cpp @@ -13,6 +13,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o 
%t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../../common.hpp" #define SG_SZ 32 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8_packed.cpp index ca700a18ee344..41bb1f4291283 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8_packed.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8_packed.cpp @@ -13,6 +13,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + // This test stores the matrix B that is VNNIed (packed). #include "../../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_tf32.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_tf32.cpp index d841323fc3260..fe09792660e9b 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_tf32.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_tf32.cpp @@ -11,6 +11,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../../common.hpp" #define SG_SZ 32 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp index 2d7af42e6b207..92ebb301677e2 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp @@ -11,6 +11,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../../common.hpp" #define SG_SZ 32 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_ops.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_ops.cpp index 1be795aac5293..9abea818c6779 100644 --- 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_ops.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_ops.cpp @@ -13,6 +13,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../../common.hpp" #define SG_SZ 32 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_float_matC.cpp index ceeb101401684..27105c0f7c933 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_float_matC.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_float_matC.cpp @@ -13,6 +13,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../../common.hpp" #define SG_SZ 32 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matA.cpp index 910670a7423b2..7bf347ca34103 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matA.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matA.cpp @@ -13,6 +13,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../../common.hpp" #define SG_SZ 32 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matB.cpp index 9e881a6e98977..40663a156874f 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matB.cpp @@ -12,6 +12,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out + +// XFAIL: cpu // XFAIL: cpu #include "../../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_all_sizes.cpp 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_all_sizes.cpp index a6021a78db2de..4bc63a47b76b3 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_all_sizes.cpp @@ -13,6 +13,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../../common.hpp" #define SG_SZ 32 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_annotated_ptr.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_annotated_ptr.cpp index a2ee153ff1895..4800b89f986e4 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_annotated_ptr.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_annotated_ptr.cpp @@ -11,6 +11,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + // Currently row major B fails when annotated_ptr is used // XFAIL: gpu diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_bf16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_bf16.cpp index a2bd31c83a0d4..3073c02d1a131 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_bf16.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_bf16.cpp @@ -13,6 +13,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../../common.hpp" #define SG_SZ 32 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp index 1c1d134f9c30d..a391887ebf15d 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp @@ -13,6 +13,8 @@ // RUN: %{build} 
-D__SPIRV_USE_COOPERATIVE_MATRIX -ffp-model=precise -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../../common.hpp" #define SG_SZ 32 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache.cpp index 64a96e16ece9a..91fd3e47bd81d 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache.cpp @@ -13,6 +13,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -ffp-model=precise // RUN: %{run} %t.out +// XFAIL: cpu + // -ffp-model=precise is added to not depend on compiler defaults. #include "../../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp index 6cc104d4a3ca7..e8d809c7b52d3 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp @@ -13,6 +13,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -DINIT_LIST -ffp-model=precise // RUN: %{run} %t.out +// XFAIL: cpu + // -ffp-model=precise is added to not depend on compiler defaults. 
#include "../../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp index 005ae4581794e..c82146c153023 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp @@ -13,6 +13,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -mllvm -inline-threshold=5000 -ffp-model=precise -o %t.out -DMANUAL_UNROLL // RUN: %{run} %t.out +// XFAIL: cpu + // -mllvm -inline-threshold added as a workaround, // since IGC doesn't support some variants of IR for Joint Matrix currently // -ffp-model=precise is added to not depend on compiler defaults. diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp index 5421d081ec86a..506f43801f078 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp @@ -13,6 +13,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -mllvm -inline-threshold=5000 -ffp-model=precise -o %t_gpu.out -DINIT_LIST -DMANUAL_UNROLL // RUN: %{run} %t_gpu.out +// XFAIL: cpu + // -mllvm -inline-threshold added as a workaround, // since IGC doesn't support some variants of IR for Joint Matrix currently // -ffp-model=precise is added to not depend on compiler defaults. 
diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16.cpp index 279dae53dd8c3..4060126529c09 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16.cpp @@ -13,6 +13,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../../common.hpp" #define SG_SZ 32 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_array.cpp index 4446e68cb82cc..a07659db2d102 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_array.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_array.cpp @@ -13,6 +13,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../../common.hpp" #define SG_SZ 32 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp index 087aca09d3167..f50d83697a9f2 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -11,6 +11,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + // This tests support of col major layout for matrix B which does transpose and // then VNNI transform. 
This is currently only available on AMX diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp index 83cc70e2a7d87..4b4aee5cfdfe4 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp @@ -11,6 +11,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + // XFAIL: gpu #include "../../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_colA_rowB_colC.cpp index dc581e77ad73a..f8d7525b94f26 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_colA_rowB_colC.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_colA_rowB_colC.cpp @@ -11,6 +11,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + // XFAIL:* #include "../../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_down_convert.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_down_convert.cpp index a473d4457ddff..683de97c16792 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_down_convert.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_down_convert.cpp @@ -11,6 +11,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../../common.hpp" #define SG_SZ 32 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_half.cpp index 7430e522a46af..98bef544aff2d 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_half.cpp +++ 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_half.cpp @@ -15,6 +15,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../../common.hpp" #define SG_SZ 32 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp index 594b0c91ad7aa..8b053f26f1c87 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp @@ -11,6 +11,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + // This tests support of col major layout for matrix B which does transpose and // then VNNI transform. This is currently only available on AMX diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp index 9735478c21a82..9993b3ff74eb6 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp @@ -13,6 +13,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + // XFAIL: gpu #include "../../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_out_bounds.cpp index 48a0215218715..355de138d6e3d 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_out_bounds.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_out_bounds.cpp @@ -11,6 +11,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + // XFAIL:* 
#include "../../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp index 707adc061bcfb..b5d5a5a33c871 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp @@ -10,6 +10,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + // XFAIL: gpu // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp index 2457eaac47b90..a8ef1190d1079 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp @@ -14,6 +14,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + // Sub-group size 32 support for this test is not currently available in IGC // XFAIL: gpu diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_ss_int8.cpp index 98c69c1655a17..d172bdf73fb7f 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_ss_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_ss_int8.cpp @@ -13,6 +13,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../../common.hpp" #define SG_SZ 32 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_su_int8.cpp index 913034564e1bf..dd1fe6af86f77 100644 --- 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_su_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_su_int8.cpp @@ -13,6 +13,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../../common.hpp" #define SG_SZ 32 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_tf32.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_tf32.cpp index 965a80fe98398..7b0d79911a000 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_tf32.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_tf32.cpp @@ -11,6 +11,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../../common.hpp" using namespace sycl; diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp index bf23b7fb8f504..6dd489487e478 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp @@ -13,6 +13,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../../common.hpp" #define SG_SZ 32 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp index e54164775eb38..7c2dc81ad0c02 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp @@ -11,6 +11,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + // XFAIL:* #include "../../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_us_int8.cpp 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_us_int8.cpp index a59a3ca04843d..842b70af021da 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_us_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_us_int8.cpp @@ -13,6 +13,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../../common.hpp" #define SG_SZ 32 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_uu_int8.cpp index d7f73d765ac5b..66bf0e255aaae 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_uu_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_uu_int8.cpp @@ -13,6 +13,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../../common.hpp" #define SG_SZ 32 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc.cpp index 3bf9030658112..9cc96a4396066 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc.cpp @@ -5,10 +5,12 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../common.hpp" #include "../element_wise_abc_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops.cpp index d2dbdeffd650d..ef4379c42c6d9 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops.cpp +++ 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops.cpp @@ -10,5 +10,7 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../common.hpp" #include "../element_wise_all_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp index 1749aa0f237f4..b212708fa1d4b 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp @@ -10,5 +10,7 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out +// XFAIL: cpu + #include "../common.hpp" #include "../element_wise_all_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d_cont.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d_cont.cpp index 8f1233a22f340..b6781d9a4a3af 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d_cont.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d_cont.cpp @@ -5,10 +5,12 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix, gpu // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: env IGC_JointMatrixLoadStoreOpt=2 %{run} %t.out +// XFAIL: cpu + #include "../common.hpp" #include "../element_wise_all_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half.cpp index 24694a44a8ca9..90b2cb8ce15d5 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half.cpp 
@@ -13,5 +13,7 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../common.hpp" #include "../element_wise_all_ops_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8.cpp index cc18fa57cec4e..20b89d0e96e50 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8.cpp @@ -11,5 +11,7 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../common.hpp" #include "../element_wise_all_ops_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed.cpp index d57758c88698f..90bb5b97279e2 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed.cpp @@ -11,6 +11,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + // This test stores the matrix B that is VNNIed (packed). 
#include "../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_scalar.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_scalar.cpp index 220c47dc57baf..3872843efb966 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_scalar.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_scalar.cpp @@ -10,5 +10,7 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out +// XFAIL: cpu + #include "../common.hpp" #include "../element_wise_all_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32.cpp index 9612d677506d0..934eea94294ad 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32.cpp @@ -11,6 +11,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../common.hpp" constexpr size_t TN = 16; diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp index e1f10456ec29a..ea1a072da2b12 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp @@ -16,5 +16,7 @@ // RUN: %{build} -fsycl-device-code-split=off -o %t_split.out // RUN: %if gpu-intel-dg2 %{ %{run} %t_split.out %} +// XFAIL: cpu + #include "../common.hpp" #include "../element_wise_all_sizes_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp index 19c2d5ef9c273..a388337663145 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp +++ 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp @@ -10,5 +10,7 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../common.hpp" #include "../element_wise_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC.cpp index dce56aebad781..95505baa92da3 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC.cpp @@ -10,5 +10,7 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../common.hpp" #include "../get_coord_float_matC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matA.cpp index 4ade538001d98..5e58090544429 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matA.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matA.cpp @@ -10,5 +10,7 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../common.hpp" #include "../get_coord_int8_matA_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB.cpp index c8de880baccf3..c9cf347fcf266 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB.cpp @@ -9,6 +9,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out + +// XFAIL: cpu // XFAIL: cpu #include "../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes.cpp index 6a93d9ba12747..266af945d6a3f 100644 --- 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes.cpp @@ -10,5 +10,7 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../common.hpp" #include "../joint_matrix_all_sizes_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_annotated_ptr.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_annotated_ptr.cpp index 9881c24e35a22..e66f4f4d05faf 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_annotated_ptr.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_annotated_ptr.cpp @@ -12,6 +12,8 @@ // RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %} // RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %} +// XFAIL: cpu + #include "../common.hpp" constexpr size_t TN = 16; diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16.cpp index 2e2f1294bab8f..6c962b8979553 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16.cpp @@ -10,5 +10,7 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../common.hpp" #include "../joint_matrix_apply_bf16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp index e404115ca3966..018849e93936c 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp @@ -10,5 +10,7 @@ // RUN: %{build} -ffp-model=precise -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include 
"../common.hpp" #include "../joint_matrix_apply_two_matrices_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp index 014f7cd68e9f6..8f26dc4d1f976 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp @@ -10,6 +10,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -ffp-model=precise // RUN: %{run} %t.out +// XFAIL: cpu + // -ffp-model=precise is added to not depend on compiler defaults. #include "../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_OOB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_OOB.cpp index 03b14bbc805dd..caf68d0b92ed1 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_OOB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_OOB.cpp @@ -11,6 +11,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t_gpu.out -ffp-model=precise -DOOB // RUN: %{run} %t_gpu.out +// XFAIL: cpu + // XFAIL: gpu // -ffp-model=precise is added to not depend on compiler defaults. diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_init.cpp index 05cf7235ffd18..79a155226a807 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_init.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_init.cpp @@ -10,6 +10,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -DINIT_LIST -ffp-model=precise // RUN: %{run} %t.out +// XFAIL: cpu + // -ffp-model=precise is added to not depend on compiler defaults. 
#include "../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll.cpp index 103eb3c65fc6c..6b94d757a557a 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll.cpp @@ -10,6 +10,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -mllvm -inline-threshold=2000 -ffp-model=precise -o %t.out -DMANUAL_UNROLL // RUN: %{run} %t.out +// XFAIL: cpu + // -mllvm -inline-threshold=2000 added as a workaround, // since IGC doesn't support some variants of IR for Joint Matrix currently // -ffp-model=precise is added to not depend on compiler defaults. diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp index d27bc33dbb5a5..898a4eadf7636 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp @@ -10,6 +10,8 @@ // RUN: %{build} -mllvm -inline-threshold=2000 -ffp-model=precise -o %t_gpu.out -DINIT_LIST -DMANUAL_UNROLL // RUN: %{run} %t_gpu.out +// XFAIL: cpu + // -mllvm -inline-threshold=2000 added as a workaround, // since IGC doesn't support some variants of IR for Joint Matrix currently // -ffp-model=precise is added to not depend on compiler defaults. 
diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp index db664554a1a14..5d9ecca5c2c32 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp @@ -10,5 +10,7 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../common.hpp" #include "../joint_matrix_bfloat16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp index bcbb20228ac00..4f01a63b0df24 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp @@ -10,5 +10,7 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../common.hpp" #include "../joint_matrix_bfloat16_array_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp index 022e26d7f58b6..567ba25816655 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -10,6 +10,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + // This tests support of col major layout for matrix B which does transpose and // then VNNI transform. 
This is currently only available on AMX diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp index f9d2efc51e970..9c9bfbfcf9b4f 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp @@ -14,5 +14,7 @@ // RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %} // RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %} +// XFAIL: cpu + #include "../common.hpp" #include "../joint_matrix_bfloat16_packedB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_colA_rowB_colC.cpp index 76c10eb46ec1f..d7d38c9093693 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_colA_rowB_colC.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_colA_rowB_colC.cpp @@ -10,6 +10,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + // XFAIL:* #include "../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_down_convert.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_down_convert.cpp index 7f1953174b118..49fe613f9a667 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_down_convert.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_down_convert.cpp @@ -10,5 +10,7 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../common.hpp" #include "../joint_matrix_down_convert_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half.cpp index 7fbcfcd0ee957..4b31771874010 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half.cpp +++ 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half.cpp @@ -12,5 +12,7 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../common.hpp" #include "../joint_matrix_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB.cpp index dff506ba3effa..c818adfa70fe3 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB.cpp @@ -10,6 +10,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + // This tests support of col major layout for matrix B which does transpose and // then VNNI transform. This is currently only available on AMX diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB.cpp index e37c5deb9ff51..8e5465349c3f6 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB.cpp @@ -15,5 +15,7 @@ // RUN: %if arch-intel_gpu_pvc %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %} // RUN: %if arch-intel_gpu_pvc %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %} +// XFAIL: cpu + #include "../common.hpp" #include "../joint_matrix_int8_rowmajorA_rowmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature.cpp index 7f7c309c906fc..37f4a129ef3b0 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature.cpp @@ 
-11,6 +11,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + // Test checks that exception will be thrown in case matrix parameters are // incompatible on the current device diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds.cpp index 391edcff2df8d..e3e0c63c3df93 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds.cpp @@ -12,6 +12,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + // XFAIL:* #include "../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp index fe3c25e338094..b933633839d47 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp @@ -9,6 +9,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../common.hpp" constexpr size_t TN = 16; diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB.cpp index 8397ad43a11cd..cbc7da3a93871 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB.cpp @@ -15,6 +15,8 @@ // RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %} // RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %} +// XFAIL: cpu + // This tests support of row major layout for matrix B which does automatic VNNI // transform. 
This is currently only available on AMX and XMX of PVC diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp index ac650910f237a..46a1a547276b7 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp @@ -10,5 +10,7 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../common.hpp" #include "../joint_matrix_ss_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp index 55d85decd7caa..5e8f4fef068bb 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp @@ -10,5 +10,7 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../common.hpp" #include "../joint_matrix_su_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp index bdf6afc5bc395..9f8ad17c18a90 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp @@ -11,6 +11,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../common.hpp" constexpr size_t TN = 16; diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC.cpp index 7aa8c5cbd41ba..ee1a5ce731957 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC.cpp @@ -10,5 +10,7 @@ // RUN: %{build} 
-D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../common.hpp" #include "../joint_matrix_transposeC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_unaligned_k.cpp index 79cca59d80b15..efb040bb60d2b 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_unaligned_k.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_unaligned_k.cpp @@ -10,6 +10,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + // XFAIL:* #include "../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp index 08e2f532b887c..66ffea69bc6c1 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp @@ -10,5 +10,7 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../common.hpp" #include "../joint_matrix_us_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp index f1b19bc257b1d..812f32f715a29 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp @@ -10,5 +10,7 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../common.hpp" #include "../joint_matrix_uu_int8_impl.hpp" From 9ec4a21b5744051a696336b0251e36aa185de0e5 Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Tue, 13 Aug 2024 06:12:30 -0700 Subject: [PATCH 15/19] try CI Signed-off-by: Sidorov, Dmitry --- .../SPVCooperativeMatrix/SG32/element_wise_abc.cpp | 2 +- 
.../SPVCooperativeMatrix/SG32/element_wise_all_ops.cpp | 2 +- .../SG32/element_wise_all_ops_int8.cpp | 2 +- .../SG32/element_wise_all_ops_int8_packed.cpp | 2 +- .../SG32/element_wise_all_ops_tf32.cpp | 2 +- .../SG32/element_wise_all_sizes.cpp | 2 +- .../SPVCooperativeMatrix/SG32/element_wise_ops.cpp | 2 +- .../SPVCooperativeMatrix/SG32/get_coord_float_matC.cpp | 2 +- .../SPVCooperativeMatrix/SG32/get_coord_int8_matA.cpp | 2 +- .../SPVCooperativeMatrix/SG32/get_coord_int8_matB.cpp | 4 +--- .../SG32/joint_matrix_all_sizes.cpp | 2 +- .../SG32/joint_matrix_annotated_ptr.cpp | 2 +- .../SG32/joint_matrix_apply_bf16.cpp | 2 +- .../SG32/joint_matrix_apply_two_matrices.cpp | 1 + .../SG32/joint_matrix_bfloat16_packedB.cpp | 2 +- .../SG32/joint_matrix_colA_rowB_colC.cpp | 4 +--- .../SG32/joint_matrix_down_convert.cpp | 2 +- .../SPVCooperativeMatrix/SG32/joint_matrix_half.cpp | 2 +- .../SG32/joint_matrix_int8_colmajorA_colmajorB.cpp | 2 +- .../SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp | 2 +- .../SG32/joint_matrix_out_bounds.cpp | 8 ++++---- .../SG32/joint_matrix_prefetch.cpp | 2 +- .../SG32/joint_matrix_rowmajorA_rowmajorB.cpp | 2 +- .../SPVCooperativeMatrix/SG32/joint_matrix_ss_int8.cpp | 2 +- .../SPVCooperativeMatrix/SG32/joint_matrix_su_int8.cpp | 2 +- .../SPVCooperativeMatrix/SG32/joint_matrix_tf32.cpp | 2 +- .../SG32/joint_matrix_transposeC.cpp | 2 +- .../SG32/joint_matrix_unaligned_k.cpp | 8 ++++---- .../SPVCooperativeMatrix/SG32/joint_matrix_us_int8.cpp | 2 +- .../SPVCooperativeMatrix/SG32/joint_matrix_uu_int8.cpp | 2 +- .../Matrix/SPVCooperativeMatrix/element_wise_abc.cpp | 1 + .../SPVCooperativeMatrix/element_wise_all_ops.cpp | 3 ++- .../SPVCooperativeMatrix/element_wise_all_ops_1d.cpp | 2 +- .../element_wise_all_ops_1d_cont.cpp | 2 +- .../SPVCooperativeMatrix/element_wise_all_ops_half.cpp | 4 ++-- .../SPVCooperativeMatrix/element_wise_all_ops_int8.cpp | 4 ++-- .../element_wise_all_ops_int8_packed.cpp | 4 ++-- .../element_wise_all_ops_scalar.cpp | 3 ++- 
.../SPVCooperativeMatrix/element_wise_all_ops_tf32.cpp | 2 +- .../SPVCooperativeMatrix/element_wise_all_sizes.cpp | 2 +- .../Matrix/SPVCooperativeMatrix/element_wise_ops.cpp | 4 ++-- .../SPVCooperativeMatrix/get_coord_float_matC.cpp | 4 ++-- .../SPVCooperativeMatrix/get_coord_int8_matA.cpp | 3 ++- .../SPVCooperativeMatrix/get_coord_int8_matB.cpp | 4 ++-- .../SPVCooperativeMatrix/joint_matrix_all_sizes.cpp | 2 +- .../joint_matrix_annotated_ptr.cpp | 2 +- .../SPVCooperativeMatrix/joint_matrix_apply_bf16.cpp | 4 ++-- .../joint_matrix_apply_two_matrices.cpp | 4 ++-- .../joint_matrix_bf16_fill_k_cache.cpp | 2 +- .../joint_matrix_bf16_fill_k_cache_OOB.cpp | 9 ++++----- .../joint_matrix_bf16_fill_k_cache_init.cpp | 4 +--- .../joint_matrix_bf16_fill_k_cache_unroll.cpp | 2 +- .../joint_matrix_bf16_fill_k_cache_unroll_init.cpp | 4 +--- .../SPVCooperativeMatrix/joint_matrix_bfloat16.cpp | 2 +- .../joint_matrix_bfloat16_array.cpp | 2 +- .../joint_matrix_bfloat16_colmajorA_colmajorB.cpp | 6 +++--- .../joint_matrix_bfloat16_packedB.cpp | 2 +- .../joint_matrix_colA_rowB_colC.cpp | 4 +--- .../SPVCooperativeMatrix/joint_matrix_down_convert.cpp | 3 ++- .../Matrix/SPVCooperativeMatrix/joint_matrix_half.cpp | 2 +- .../joint_matrix_int8_colmajorA_colmajorB.cpp | 6 +++--- .../joint_matrix_int8_rowmajorA_rowmajorB.cpp | 2 +- .../joint_matrix_opt_kernel_feature.cpp | 2 +- .../SPVCooperativeMatrix/joint_matrix_out_bounds.cpp | 10 +++------- .../SPVCooperativeMatrix/joint_matrix_prefetch.cpp | 2 +- .../joint_matrix_rowmajorA_rowmajorB.cpp | 2 +- .../SPVCooperativeMatrix/joint_matrix_ss_int8.cpp | 2 +- .../SPVCooperativeMatrix/joint_matrix_su_int8.cpp | 2 +- .../Matrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp | 2 +- .../SPVCooperativeMatrix/joint_matrix_transposeC.cpp | 2 +- .../SPVCooperativeMatrix/joint_matrix_unaligned_k.cpp | 8 +++----- .../SPVCooperativeMatrix/joint_matrix_us_int8.cpp | 2 +- .../SPVCooperativeMatrix/joint_matrix_uu_int8.cpp | 2 +- 73 files changed, 102 insertions(+), 
113 deletions(-) diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_abc.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_abc.cpp index cce4a1b89f9fb..16e1951eab262 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_abc.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_abc.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops.cpp index 91f908ed1c335..6c80692109ca8 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8.cpp index ccab3b8c200ef..4d7fdb9285023 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size 
= 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8_packed.cpp index 41bb1f4291283..87ede89ab00c8 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8_packed.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_int8_packed.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_tf32.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_tf32.cpp index fe09792660e9b..613bcd1f7650e 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_tf32.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_ops_tf32.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, matrix-tf32 +// REQUIRES: matrix-tf32 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp index 92ebb301677e2..b473dc00a1844 
100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, matrix +// REQUIRES: matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_ops.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_ops.cpp index 9abea818c6779..611c369b99011 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_ops.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_ops.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_float_matC.cpp index 27105c0f7c933..8ef78f76b3509 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_float_matC.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_float_matC.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matA.cpp index 7bf347ca34103..9d3e62726720c 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matA.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matA.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matB.cpp index 40663a156874f..0b6dac6047681 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/get_coord_int8_matB.cpp @@ -7,13 +7,11 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out - -// XFAIL: cpu // XFAIL: cpu #include "../../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_all_sizes.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_all_sizes.cpp index 4bc63a47b76b3..cddc7659167a2 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_all_sizes.cpp @@ -7,7 +7,7 @@ 
//===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_annotated_ptr.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_annotated_ptr.cpp index 4800b89f986e4..21b9014e7bff9 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_annotated_ptr.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_annotated_ptr.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, matrix +// REQUIRES: matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_bf16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_bf16.cpp index 3073c02d1a131..b5fb8bf2c6dfe 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_bf16.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_bf16.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp index a391887ebf15d..35a9b08119910 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp @@ -14,6 +14,7 @@ // RUN: %{run} %t.out // XFAIL: cpu +// XFAIL: gpu #include "../../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp index 4b4aee5cfdfe4..bfbc2a7810e2c 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, matrix +// REQUIRES: matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_colA_rowB_colC.cpp index f8d7525b94f26..6a51e03bebd23 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_colA_rowB_colC.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_colA_rowB_colC.cpp @@ -5,14 +5,12 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, matrix +// REQUIRES: matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu - // XFAIL:* #include "../../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_down_convert.cpp 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_down_convert.cpp index 683de97c16792..efe3c2b456802 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_down_convert.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_down_convert.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, matrix +// REQUIRES: matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_half.cpp index 98bef544aff2d..16953caff99e0 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_half.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_half.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: gpu, aspect-fp16 +// REQUIRES: aspect-fp16 // REQUIRES: aspect-ext_intel_matrix // REQUIRES: matrix-fp16 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp index 8b053f26f1c87..a3875112e556e 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, matrix +// REQUIRES: matrix // REQUIRES-INTEL-DRIVER: lin: 27501, 
win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp index 9993b3ff74eb6..59c3173a35a8f 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_out_bounds.cpp index 355de138d6e3d..6926e7951b0db 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_out_bounds.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_out_bounds.cpp @@ -5,15 +5,15 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, matrix +// REQUIRES: matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 +// UNSUPPORTED: gpu-intel-dg2, cpu + // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu - -// XFAIL:* +// XFAIL:gpu #include "../../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp index b5d5a5a33c871..27a0b05b6bced 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp 
+++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp index a8ef1190d1079..610b2b5bf6e5c 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // This tests support of row major layout for matrix B which does automatic VNNI -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // VNNI transform and sub-group size 32 are not supported yet on DG2 by IGC // UNSUPPORTED: gpu-intel-dg2 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_ss_int8.cpp index d172bdf73fb7f..fce90d5a42fa3 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_ss_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_ss_int8.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_su_int8.cpp index dd1fe6af86f77..c3d0302a3f187 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_su_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_su_int8.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_tf32.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_tf32.cpp index 7b0d79911a000..e2ae342ed4598 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_tf32.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_tf32.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, matrix-tf32 +// REQUIRES: matrix-tf32 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp index 6dd489487e478..aadb351b7db9c 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// 
REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp index 7c2dc81ad0c02..4177e52e75647 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp @@ -5,15 +5,15 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, matrix +// REQUIRES: matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 +// UNSUPPORTED: gpu-intel-dg2, cpu + // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu - -// XFAIL:* +// XFAIL: gpu #include "../../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_us_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_us_int8.cpp index 842b70af021da..56ae3f112bb85 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_us_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_us_int8.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_uu_int8.cpp index 66bf0e255aaae..daf87d386d3c8 100644 --- 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_uu_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_uu_int8.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc.cpp index 9cc96a4396066..9a27ccf807076 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_abc.cpp @@ -11,6 +11,7 @@ // RUN: %{run} %t.out // XFAIL: cpu +// XFAIL: gpu #include "../common.hpp" #include "../element_wise_abc_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops.cpp index ef4379c42c6d9..5cce8d34344c7 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops.cpp @@ -5,12 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // XFAIL: cpu +// XFAIL: gpu #include "../common.hpp" #include "../element_wise_all_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp index b212708fa1d4b..d4b7e5c808d21 100644 --- 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp @@ -10,7 +10,7 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out -// XFAIL: cpu +// XFAIL: gpu #include "../common.hpp" #include "../element_wise_all_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d_cont.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d_cont.cpp index b6781d9a4a3af..076e1c9114523 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d_cont.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d_cont.cpp @@ -10,7 +10,7 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: env IGC_JointMatrixLoadStoreOpt=2 %{run} %t.out -// XFAIL: cpu +// XFAIL: gpu #include "../common.hpp" #include "../element_wise_all_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half.cpp index 90b2cb8ce15d5..8853c0306edb1 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_half.cpp @@ -5,8 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// UNSUPPORTED: windows, linux -// REQUIRES: gpu, aspect-fp16 +// REQUIRES: aspect-fp16 // REQUIRES: aspect-ext_intel_matrix // REQUIRES: matrix-fp16 @@ -14,6 +13,7 @@ // RUN: %{run} %t.out // XFAIL: cpu +// XFAIL: gpu #include "../common.hpp" #include "../element_wise_all_ops_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8.cpp index 20b89d0e96e50..ba10657af03ae 100644 --- 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8.cpp @@ -5,13 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// UNSUPPORTED: windows, linux -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // XFAIL: cpu +// XFAIL: gpu #include "../common.hpp" #include "../element_wise_all_ops_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed.cpp index 90bb5b97279e2..105c1ed763154 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_int8_packed.cpp @@ -5,13 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// UNSUPPORTED: windows, linux -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // XFAIL: cpu +// XFAIL: gpu // This test stores the matrix B that is VNNIed (packed). 
diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_scalar.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_scalar.cpp index 3872843efb966..d86cd6f469a66 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_scalar.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_scalar.cpp @@ -5,12 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix, gpu // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out // XFAIL: cpu +// XFAIL: gpu #include "../common.hpp" #include "../element_wise_all_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32.cpp index 934eea94294ad..cc26efe30d8b8 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_tf32.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, matrix-tf32 +// REQUIRES: matrix-tf32 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp index ea1a072da2b12..afb7535d110a3 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // 
//===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp index a388337663145..ee7c063b2d4c8 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp @@ -5,12 +5,12 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu +// XFAIL: gpu #include "../common.hpp" #include "../element_wise_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC.cpp index 95505baa92da3..2438df6b3318d 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC.cpp @@ -5,12 +5,12 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu +// XFAIL: gpu #include "../common.hpp" #include "../get_coord_float_matC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matA.cpp index 5e58090544429..e9f19ccc5b04b 100644 --- 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matA.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matA.cpp @@ -5,11 +5,12 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: gpu // XFAIL: cpu #include "../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB.cpp index c9cf347fcf266..b80c12d67d8c9 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB.cpp @@ -5,13 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // XFAIL: cpu -// XFAIL: cpu +// XFAIL: gpu #include "../common.hpp" #include "../get_coord_int8_matB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes.cpp index 266af945d6a3f..10d6d9ee62d56 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_all_sizes.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out diff --git 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_annotated_ptr.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_annotated_ptr.cpp index e66f4f4d05faf..52e680fb25070 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_annotated_ptr.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_annotated_ptr.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, matrix +// REQUIRES: matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16.cpp index 6c962b8979553..64dd1eb81346b 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16.cpp @@ -5,12 +5,12 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu +// XFAIL: gpu #include "../common.hpp" #include "../joint_matrix_apply_bf16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp index 018849e93936c..a376614406e8e 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp @@ -5,12 +5,12 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix 
+// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -ffp-model=precise -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu +// XFAIL: gpu #include "../common.hpp" #include "../joint_matrix_apply_two_matrices_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp index 8f26dc4d1f976..b1c3996c7a40a 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -ffp-model=precise // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_OOB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_OOB.cpp index caf68d0b92ed1..af4b28e090b4c 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_OOB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_OOB.cpp @@ -5,16 +5,15 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix, gpu // UNSUPPORTED: gpu-intel-dg2 +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t_gpu_vnni.out -ffp-model=precise -DOOB -DVNNI +// RUN: %{run} %t_gpu_vnni.out + // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t_gpu.out -ffp-model=precise -DOOB // RUN: %{run} %t_gpu.out -// XFAIL: cpu - -// XFAIL: gpu - // -ffp-model=precise is added to not depend on compiler defaults. 
#include "../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_init.cpp index 79a155226a807..416a894fc2c67 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_init.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_init.cpp @@ -5,13 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix, gpu // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -DINIT_LIST -ffp-model=precise // RUN: %{run} %t.out -// XFAIL: cpu - // -ffp-model=precise is added to not depend on compiler defaults. #include "../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll.cpp index 6b94d757a557a..32a398e241fe6 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -mllvm -inline-threshold=2000 -ffp-model=precise -o %t.out -DMANUAL_UNROLL // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp index 898a4eadf7636..30ad9e15b3a99 100644 --- 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp @@ -5,13 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix, gpu +// REQUIRES: aspect-ext_intel_matrix, gpu // RUN: %{build} -mllvm -inline-threshold=2000 -ffp-model=precise -o %t_gpu.out -DINIT_LIST -DMANUAL_UNROLL // RUN: %{run} %t_gpu.out -// XFAIL: cpu - // -mllvm -inline-threshold=2000 added as a workaround, // since IGC doesn't support some variants of IR for Joint Matrix currently // -ffp-model=precise is added to not depend on compiler defaults. diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp index 5d9ecca5c2c32..c6f1db06e625f 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp index 4f01a63b0df24..69f9aa8553bd3 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_array.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, 
aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp index 567ba25816655..11724ad32aa3b 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -5,18 +5,18 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, matrix +// REQUIRES: matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu - // This tests support of col major layout for matrix B which does transpose and // then VNNI transform. This is currently only available on AMX // XFAIL: gpu +// XFAIL: cpu + #include "../common.hpp" constexpr size_t TN = 16; diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp index 9c9bfbfcf9b4f..1b7de0a673641 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, matrix +// REQUIRES: matrix // REQUIRES-INTEL-DRIVER: lin: 27868, win: 101.5181 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_colA_rowB_colC.cpp index d7d38c9093693..2c6323364ead9 100644 --- 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_colA_rowB_colC.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_colA_rowB_colC.cpp @@ -5,13 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, matrix +// REQUIRES: matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu - // XFAIL:* #include "../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_down_convert.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_down_convert.cpp index 49fe613f9a667..760402d0a35cc 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_down_convert.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_down_convert.cpp @@ -5,12 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, matrix +// REQUIRES: matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out // XFAIL: cpu +// XFAIL: gpu #include "../common.hpp" #include "../joint_matrix_down_convert_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half.cpp index 4b31771874010..43b70b990b7d1 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_half.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-fp16 +// REQUIRES: aspect-fp16 // REQUIRES: aspect-ext_intel_matrix // REQUIRES: matrix-fp16 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB.cpp 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB.cpp index c818adfa70fe3..37d30b3be9527 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB.cpp @@ -5,18 +5,18 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, matrix +// REQUIRES: matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu - // This tests support of col major layout for matrix B which does transpose and // then VNNI transform. This is currently only available on AMX // XFAIL: gpu +// XFAIL: cpu + #include "../common.hpp" constexpr size_t TN = 16; diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB.cpp index 8e5465349c3f6..51843073938a5 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature.cpp index 37f4a129ef3b0..a0b53f0cff119 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature.cpp @@ -6,7 +6,7 @@ // 
//===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds.cpp index e3e0c63c3df93..138f6738155f9 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_out_bounds.cpp @@ -5,17 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix - -// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix +// UNSUPPORTED: gpu-intel-dg2, cpu +// // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu - -// XFAIL:* - #include "../common.hpp" constexpr size_t TN = 16; diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp index b933633839d47..0c99e7a09063e 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB.cpp index cbc7da3a93871..6d213ba4ed870 100644 --- 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_rowmajorA_rowmajorB.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // VNNI transform is not supported yet by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp index 46a1a547276b7..d43e10ffed568 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_ss_int8.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp index 5e8f4fef068bb..1432abf115508 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_su_int8.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp index 9f8ad17c18a90..3033631282058 100644 --- 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_tf32.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, matrix-tf32 +// REQUIRES: matrix-tf32 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC.cpp index ee1a5ce731957..259659068935f 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_unaligned_k.cpp index efb040bb60d2b..43f3c12b70079 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_unaligned_k.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_unaligned_k.cpp @@ -5,15 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, matrix +// REQUIRES: matrix + +// UNSUPPORTED: gpu-intel-dg2, cpu // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu - -// XFAIL:* - #include "../common.hpp" constexpr size_t TN = 16; diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp index 66ffea69bc6c1..93050559c1dfe 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_us_int8.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp index 812f32f715a29..2b988bdf92bcc 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_uu_int8.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out From ae0ca2c385b537a6587d5a37567ee61fe04ce727 Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Tue, 13 Aug 2024 06:59:35 -0700 Subject: [PATCH 16/19] disable missing test Signed-off-by: Sidorov, Dmitry --- .../Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp index afb7535d110a3..bb3e64a2b9dae 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp @@ -17,6 +17,7 @@ // RUN: %if gpu-intel-dg2 %{ %{run} %t_split.out %} // XFAIL: cpu +// XFAIL: gpu #include 
"../common.hpp" #include "../element_wise_all_sizes_impl.hpp" From f20dcd341bbb922594d5cbfb9e4e53e591792fb4 Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Wed, 14 Aug 2024 04:52:40 -0700 Subject: [PATCH 17/19] alignments Signed-off-by: Sidorov, Dmitry --- .../SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp | 2 -- .../SG32/joint_matrix_apply_two_matrices.cpp | 2 +- .../SG32/joint_matrix_bf16_fill_k_cache.cpp | 9 ++++++--- .../SG32/joint_matrix_bf16_fill_k_cache_init.cpp | 4 ++-- .../SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp | 4 ++-- .../SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp | 6 ++---- .../SPVCooperativeMatrix/SG32/joint_matrix_bfloat16.cpp | 2 +- .../SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp | 2 +- .../SG32/joint_matrix_bfloat16_packedB.cpp | 2 -- .../SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp | 2 -- .../SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp | 1 - .../SG32/joint_matrix_transposeC.cpp | 2 -- .../SG32/joint_matrix_unaligned_k.cpp | 2 +- .../SPVCooperativeMatrix/element_wise_all_ops_1d.cpp | 2 +- .../SPVCooperativeMatrix/element_wise_all_sizes.cpp | 2 +- .../Matrix/SPVCooperativeMatrix/element_wise_ops.cpp | 1 + .../Matrix/SPVCooperativeMatrix/get_coord_float_matC.cpp | 1 + .../Matrix/SPVCooperativeMatrix/get_coord_int8_matB.cpp | 1 - .../SPVCooperativeMatrix/joint_matrix_apply_bf16.cpp | 1 + .../joint_matrix_apply_two_matrices.cpp | 1 + .../joint_matrix_bf16_fill_k_cache.cpp | 5 +++-- .../joint_matrix_bf16_fill_k_cache_init.cpp | 2 +- .../joint_matrix_bf16_fill_k_cache_unroll.cpp | 2 +- .../joint_matrix_bf16_fill_k_cache_unroll_init.cpp | 2 +- .../joint_matrix_bfloat16_colmajorA_colmajorB.cpp | 4 ++-- .../joint_matrix_bfloat16_packedB.cpp | 2 -- .../joint_matrix_int8_colmajorA_colmajorB.cpp | 4 ++-- .../joint_matrix_int8_rowmajorA_rowmajorB.cpp | 2 -- .../joint_matrix_opt_kernel_feature.cpp | 2 -- .../SPVCooperativeMatrix/joint_matrix_transposeC.cpp | 2 -- 30 files changed, 32 insertions(+), 44 deletions(-) 
diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp index b473dc00a1844..47ca1d8104a7d 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp @@ -11,8 +11,6 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu - #include "../../common.hpp" #define SG_SZ 32 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp index 35a9b08119910..5fa27fc66df1c 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_apply_two_matrices.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache.cpp index 91fd3e47bd81d..91175103be6cc 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache.cpp @@ -7,11 +7,14 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 
27501, win: 101.4943 -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -ffp-model=precise -// RUN: %{run} %t.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t_vnni.out -DVNNI -ffp-model=precise +// RUN: %{run} %t_vnni.out + +// TODO: add row major compilation and run once Sub-group size 32 +// support becomes available in IGC for row major // XFAIL: cpu diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp index e8d809c7b52d3..7f9d88b30b7fe 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp @@ -7,10 +7,10 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix, gpu // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -DINIT_LIST -ffp-model=precise +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -DINIT_LIST -DVNNI -ffp-model=precise // RUN: %{run} %t.out // XFAIL: cpu diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp index c82146c153023..df0b207c0605b 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp @@ -7,10 +7,10 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: 
gpu-intel-dg2 -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -mllvm -inline-threshold=5000 -ffp-model=precise -o %t.out -DMANUAL_UNROLL +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -mllvm -inline-threshold=5000 -ffp-model=precise -o %t.out -DMANUAL_UNROLL -DVNNI // RUN: %{run} %t.out // XFAIL: cpu diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp index 506f43801f078..7fdd5e62736bc 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp @@ -7,14 +7,12 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix, gpu // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -mllvm -inline-threshold=5000 -ffp-model=precise -o %t_gpu.out -DINIT_LIST -DMANUAL_UNROLL +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -mllvm -inline-threshold=5000 -ffp-model=precise -o %t_gpu.out -DINIT_LIST -DMANUAL_UNROLL -DVNNI // RUN: %{run} %t_gpu.out -// XFAIL: cpu - // -mllvm -inline-threshold added as a workaround, // since IGC doesn't support some variants of IR for Joint Matrix currently // -ffp-model=precise is added to not depend on compiler defaults. 
diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16.cpp index 4060126529c09..637e9ebcd858c 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp index f50d83697a9f2..6b6fd3503bccc 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, matrix +// REQUIRES: matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp index bfbc2a7810e2c..e888d5ec37d57 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp @@ -11,8 +11,6 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: 
%{run} %t.out -// XFAIL: cpu - // XFAIL: gpu #include "../../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp index 59c3173a35a8f..654bdfe695116 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp @@ -13,8 +13,6 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu - // XFAIL: gpu #include "../../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp index 27a0b05b6bced..7073feae64ac4 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_prefetch.cpp @@ -11,7 +11,6 @@ // RUN: %{run} %t.out // XFAIL: cpu - // XFAIL: gpu // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp index aadb351b7db9c..d0fd090b2e371 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_transposeC.cpp @@ -13,8 +13,6 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu - #include "../../common.hpp" #define SG_SZ 32 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp index 4177e52e75647..d0ac32d7661e6 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp +++ 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_unaligned_k.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // UNSUPPORTED: gpu-intel-dg2, cpu diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp index d4b7e5c808d21..73afef5afa996 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_ops_1d.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix, gpu // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp index bb3e64a2b9dae..27b2f4b21f0dc 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_all_sizes.cpp @@ -13,7 +13,7 @@ // This is a version of the test with disabled device code // split to test against fixed bug in IGC -// RUN: %{build} -fsycl-device-code-split=off -o %t_split.out +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -fsycl-device-code-split=off -o %t_split.out // RUN: %if gpu-intel-dg2 %{ %{run} %t_split.out %} // XFAIL: cpu diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp index ee7c063b2d4c8..e45b7507ed828 100644 --- 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/element_wise_ops.cpp @@ -11,6 +11,7 @@ // RUN: %{run} %t.out // XFAIL: gpu +// XFAIL: cpu #include "../common.hpp" #include "../element_wise_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC.cpp index 2438df6b3318d..41d9971c6f003 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_float_matC.cpp @@ -11,6 +11,7 @@ // RUN: %{run} %t.out // XFAIL: gpu +// XFAIL: cpu #include "../common.hpp" #include "../get_coord_float_matC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB.cpp index b80c12d67d8c9..0f370e1be8898 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/get_coord_int8_matB.cpp @@ -9,7 +9,6 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out - // XFAIL: cpu // XFAIL: gpu diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16.cpp index 64dd1eb81346b..24bf88f9d3f34 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_bf16.cpp @@ -11,6 +11,7 @@ // RUN: %{run} %t.out // XFAIL: gpu +// XFAIL: cpu #include "../common.hpp" #include "../joint_matrix_apply_bf16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp index a376614406e8e..1d1927c3a1d5d 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp +++ 
b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_apply_two_matrices.cpp @@ -11,6 +11,7 @@ // RUN: %{run} %t.out // XFAIL: gpu +// XFAIL: cpu #include "../common.hpp" #include "../joint_matrix_apply_two_matrices_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp index b1c3996c7a40a..d6215c816f531 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp @@ -7,11 +7,12 @@ //===----------------------------------------------------------------------===// // REQUIRES: aspect-ext_intel_matrix +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t_vnni.out -DVNNI -ffp-model=precise +// RUN: %{run} %t_vnni.out + // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -ffp-model=precise // RUN: %{run} %t.out -// XFAIL: cpu - // -ffp-model=precise is added to not depend on compiler defaults. #include "../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_init.cpp index 416a894fc2c67..f4c77bb50c5da 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_init.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_init.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // REQUIRES: aspect-ext_intel_matrix, gpu -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -DINIT_LIST -ffp-model=precise +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -DINIT_LIST -DVNNI -ffp-model=precise // RUN: %{run} %t.out // -ffp-model=precise is added to not depend on compiler defaults. 
diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll.cpp index 32a398e241fe6..89e9048170837 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // REQUIRES: aspect-ext_intel_matrix -// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -mllvm -inline-threshold=2000 -ffp-model=precise -o %t.out -DMANUAL_UNROLL +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -mllvm -inline-threshold=2000 -ffp-model=precise -o %t.out -DMANUAL_UNROLL -DVNNI // RUN: %{run} %t.out // XFAIL: cpu diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp index 30ad9e15b3a99..d653268345dcc 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // REQUIRES: aspect-ext_intel_matrix, gpu -// RUN: %{build} -mllvm -inline-threshold=2000 -ffp-model=precise -o %t_gpu.out -DINIT_LIST -DMANUAL_UNROLL +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -mllvm -inline-threshold=2000 -ffp-model=precise -o %t_gpu.out -DINIT_LIST -DMANUAL_UNROLL -DVNNI // RUN: %{run} %t_gpu.out // -mllvm -inline-threshold=2000 added as a workaround, diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp index 11724ad32aa3b..9359640ff4a53 100644 --- 
a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -10,13 +10,13 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + // This tests support of col major layout for matrix B which does transpose and // then VNNI transform. This is currently only available on AMX // XFAIL: gpu -// XFAIL: cpu - #include "../common.hpp" constexpr size_t TN = 16; diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp index 1b7de0a673641..99b764dbe7039 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp @@ -14,7 +14,5 @@ // RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %} // RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %} -// XFAIL: cpu - #include "../common.hpp" #include "../joint_matrix_bfloat16_packedB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB.cpp index 37d30b3be9527..8559cc7849f50 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB.cpp @@ -10,13 +10,13 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + // This tests support of col major layout for matrix B which does transpose and // then VNNI transform. 
This is currently only available on AMX // XFAIL: gpu -// XFAIL: cpu - #include "../common.hpp" constexpr size_t TN = 16; diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB.cpp index 51843073938a5..31bc890fdd197 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_rowmajorA_rowmajorB.cpp @@ -15,7 +15,5 @@ // RUN: %if arch-intel_gpu_pvc %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %} // RUN: %if arch-intel_gpu_pvc %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %} -// XFAIL: cpu - #include "../common.hpp" #include "../joint_matrix_int8_rowmajorA_rowmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature.cpp index a0b53f0cff119..2ef42e3b499e9 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_opt_kernel_feature.cpp @@ -11,8 +11,6 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu - // Test checks that exception will be thrown in case matrix parameters are // incompatible on the current device diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC.cpp index 259659068935f..2a2ee8eccaf66 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_transposeC.cpp @@ -10,7 +10,5 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu - #include "../common.hpp" #include "../joint_matrix_transposeC_impl.hpp" From 
792b02678d0b09bbbe10f08dd5b0bb930e47b504 Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Wed, 14 Aug 2024 07:02:57 -0700 Subject: [PATCH 18/19] update cpu checks Signed-off-by: Sidorov, Dmitry --- .../Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp | 2 ++ .../SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp | 2 -- .../SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp | 1 + .../SG32/joint_matrix_int8_colmajorA_colmajorB.cpp | 2 -- .../SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp | 2 ++ .../joint_matrix_bfloat16_colmajorA_colmajorB.cpp | 2 -- .../SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp | 2 ++ .../joint_matrix_int8_colmajorA_colmajorB.cpp | 2 -- 8 files changed, 7 insertions(+), 8 deletions(-) diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp index 47ca1d8104a7d..b473dc00a1844 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/element_wise_all_sizes.cpp @@ -11,6 +11,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu + #include "../../common.hpp" #define SG_SZ 32 diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp index 6b6fd3503bccc..80de9af24bb85 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -11,8 +11,6 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu - // This tests support of col major layout for matrix B which does transpose and // then VNNI transform. 
This is currently only available on AMX diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp index e888d5ec37d57..aa8e4bf7758ca 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_packedB.cpp @@ -12,6 +12,7 @@ // RUN: %{run} %t.out // XFAIL: gpu +// XFAIL: cpu #include "../../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp index a3875112e556e..375e3bb958733 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp @@ -11,8 +11,6 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu - // This tests support of col major layout for matrix B which does transpose and // then VNNI transform. This is currently only available on AMX diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp index d6215c816f531..eb68e332eaa7c 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache.cpp @@ -13,6 +13,8 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -ffp-model=precise // RUN: %{run} %t.out +// XFAIL: cpu + // -ffp-model=precise is added to not depend on compiler defaults. 
#include "../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp index 9359640ff4a53..ff7d2f1e05882 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -10,8 +10,6 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu - // This tests support of col major layout for matrix B which does transpose and // then VNNI transform. This is currently only available on AMX diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp index 99b764dbe7039..1b7de0a673641 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bfloat16_packedB.cpp @@ -14,5 +14,7 @@ // RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %} // RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %} +// XFAIL: cpu + #include "../common.hpp" #include "../joint_matrix_bfloat16_packedB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB.cpp index 8559cc7849f50..e0d2c5bd81769 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_int8_colmajorA_colmajorB.cpp @@ -10,8 +10,6 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu - // This tests support of col major layout for matrix B which does transpose and // then VNNI transform. 
This is currently only available on AMX From 69e2c427eb54b48ef9175f1fc0e149b17f6d693d Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Wed, 14 Aug 2024 13:40:52 -0700 Subject: [PATCH 19/19] Remove the rest Signed-off-by: Sidorov, Dmitry --- .../SG32/joint_matrix_bf16_fill_k_cache_init.cpp | 2 -- .../SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_array.cpp | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp index 7f9d88b30b7fe..eda198c7b2a41 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp @@ -13,8 +13,6 @@ // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out -DINIT_LIST -DVNNI -ffp-model=precise // RUN: %{run} %t.out -// XFAIL: cpu - // -ffp-model=precise is added to not depend on compiler defaults. #include "../../common.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_array.cpp index a07659db2d102..eee85175d678d 100644 --- a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_array.cpp +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bfloat16_array.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: gpu, aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t.out