diff --git a/SYCL/Matrix/element_wise_all_ops_cuda.cpp b/SYCL/Matrix/element_wise_all_ops_cuda.cpp
index 832194e3d9..2f5322cca1 100644
--- a/SYCL/Matrix/element_wise_all_ops_cuda.cpp
+++ b/SYCL/Matrix/element_wise_all_ops_cuda.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 // REQUIRES: cuda
 
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=3 %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 %s -o %t.out
 // RUN: %t.out
 
 #include <sycl/sycl.hpp>
@@ -64,9 +64,9 @@ void matrix_verify_op(queue q, big_matrix<T, M * nWGperDim, N * nWGperDim> &C,
 
             auto sg = spmd_item.get_sub_group();
 
-            joint_matrix<T, matrix_use::a, M, K, matrix_layout::row_major> sub_a;
-            joint_matrix<T, matrix_use::b, K, N, matrix_layout::row_major> sub_b;
-            joint_matrix<T, matrix_use::accumulator, M, N, matrix_layout::row_major> sub_c;
+            joint_matrix<T, use::a, M, K, layout::row_major> sub_a;
+            joint_matrix<T, use::b, K, N, layout::row_major> sub_b;
+            joint_matrix<T, use::accumulator, M, N> sub_c;
 
             joint_matrix_fill(sg, sub_a, 3);
             joint_matrix_fill(sg, sub_b, 1);
@@ -104,7 +104,7 @@ void matrix_verify_op(queue q, big_matrix<T, M * nWGperDim, N * nWGperDim> &C,
                                accC.get_pointer() +
                                    (sg_startx * M) * (N * nWGperDim) +
                                    sg_starty / SG_SZ * N,
-                               (N * nWGperDim));
+                               (N * nWGperDim), layout::row_major);
           }); // parallel for
     }).wait();
 }
diff --git a/SYCL/Matrix/element_wise_all_ops_cuda_legacy.cpp b/SYCL/Matrix/element_wise_all_ops_cuda_legacy.cpp
new file mode 100644
index 0000000000..c73da53888
--- /dev/null
+++ b/SYCL/Matrix/element_wise_all_ops_cuda_legacy.cpp
@@ -0,0 +1,184 @@
+//==----------- element_wise_all_ops_cuda.cpp - DPC++ joint_matrix---------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: cuda
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=3 %s -o %t.out
+// RUN: %t.out
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi::experimental::matrix;
+using sycl::ext::oneapi::experimental::bfloat16;
+
+#define SG_SZ 32
+constexpr size_t nWGperDim = 2;
+
+class Logical {};
+
+template <typename T, typename Operation, size_t M, size_t K, size_t N>
+class KernelName;
+
+template <typename T, size_t NUM_ROWS, size_t NUM_COLS> struct big_matrix {
+public:
+  T *mat;
+
+public:
+  T *get_data() { return mat; }
+  void set_data(T *data) { mat = data; }
+  big_matrix(T *data) : mat(data) {}
+};
+
+template <typename T, size_t M, size_t N>
+void assert_ops_ref(T *C, const float ref) {
+  for (size_t i = 0; i < M; i++)
+    for (size_t j = 0; j < N; j++) {
+      auto diff = C[i + j * M] - ref;
+      assert(std::fabs(static_cast<float>(diff)) <
+             std::numeric_limits<float>::epsilon());
+    }
+}
+template <typename T, size_t M, size_t K, size_t N, typename Operation>
+void matrix_verify_op(queue q, big_matrix<T, M * nWGperDim, N * nWGperDim> &C,
+                      nd_range<2> &r, const float ref, Operation Op) {
+  {
+    buffer<T, 2> bufC(C.get_data(), range<2>(N * nWGperDim, M * nWGperDim));
+
+    q.submit([&](handler &cgh) {
+      accessor<T, 2, access::mode::read_write, target::device> accC(bufC,
+                                                                    cgh);
+
+      cgh.parallel_for<KernelName<T, Operation, M, K, N>>(
+          r, [accC,
+              Op](nd_item<2> spmd_item) [[sycl::reqd_sub_group_size(SG_SZ)]] {
+            const auto global_idx = spmd_item.get_global_id(0);
+            const auto global_idy = spmd_item.get_global_id(1);
+            const auto sg_startx = global_idx - spmd_item.get_local_id(0);
+            const auto sg_starty = global_idy - spmd_item.get_local_id(1);
+
+            auto sg = spmd_item.get_sub_group();
+
+            joint_matrix<T, matrix_use::a, M, K, matrix_layout::row_major> sub_a;
+            joint_matrix<T, matrix_use::b, K, N, matrix_layout::row_major> sub_b;
+            joint_matrix<T, matrix_use::accumulator, M, N, matrix_layout::row_major> sub_c;
+
+            joint_matrix_fill(sg, sub_a, 3);
+            joint_matrix_fill(sg, sub_b, 1);
+            joint_matrix_fill(sg, sub_c, -80);
+
+            auto wi_slice_a = sub_a.get_wi_data();
+            for (int i = 0; i < wi_slice_a.length(); i++) {
+              if constexpr (std::is_same_v<Operation, Logical>) {
+                if (wi_slice_a[i]) {
+                  if (wi_slice_a[i] > 2.0 || wi_slice_a[i] >= 3.0 ||
+                      wi_slice_a[i] < 4.0 || wi_slice_a[i] <= 3.0) {
+                    T val = (wi_slice_a[i] != (2.0)) ? wi_slice_a[i]
+                                                     : static_cast<T>(2.0);
+                    val = ((val) - (1));
+                    val = ((val) + (1));
+                    if (wi_slice_a[i] == (2.0)) {
+                      val = ((val) - (2));
+                      val = ((val) * (3));
+                      val = ((val) / (2));
+
+                    } else {
+                      val = ((val) + (2));
+                    }
+                    wi_slice_a[i] = val;
+                  }
+                }
+              } else {
+                wi_slice_a[i] = Op(wi_slice_a[i], 2);
+              }
+            }
+
+            sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
+
+            joint_matrix_store(sg, sub_c,
+                               accC.get_pointer() +
+                                   (sg_startx * M) * (N * nWGperDim) +
+                                   sg_starty / SG_SZ * N,
+                               (N * nWGperDim));
+          }); // parallel for
+    }).wait();
+  }
+  assert_ops_ref<T, M * nWGperDim, N * nWGperDim>(C.get_data(), ref);
+}
+
+static constexpr size_t MATRIX_M = 16 * nWGperDim;
+static constexpr size_t MATRIX_N = 16 * nWGperDim;
+
+int main() {
+
+  float D[MATRIX_M][MATRIX_N];
+  big_matrix<float, MATRIX_M, MATRIX_N> MD_f((float *)&D);
+
+  queue q;
+  auto computeCapability =
+      std::stof(q.get_device().get_info<sycl::info::device::backend_version>());
+  nd_range<2> r({nWGperDim, nWGperDim * SG_SZ}, {1, 1 * SG_SZ});
+
+  if (computeCapability >= 7.0) {
+    matrix_verify_op<half, 16, 16, 16>(q, MD_f, r, 0.0, std::plus<half>{});
+    matrix_verify_op<half, 16, 16, 16>(q, MD_f, r, 0.0, Logical{});
+    matrix_verify_op<half, 16, 16, 16>(q, MD_f, r, 16.0,
+                                       std::multiplies<half>{});
+    matrix_verify_op<half, 16, 16, 16>(q, MD_f, r, -56.0,
+                                       std::divides<half>{});
+    matrix_verify_op<half, 16, 16, 16>(q, MD_f, r, -64.0,
+                                       std::minus<half>{});
+  }
+
+  if (computeCapability >= 7.2) {
+    int32_t D_i[MATRIX_M][MATRIX_N];
+    big_matrix<int32_t, MATRIX_M, MATRIX_N> MD_i((int32_t *)&D_i);
+    matrix_verify_op<int8_t, 16, 16, 16>(q, MD_i, r, 0,
+                                         std::plus<int8_t>{});
+    matrix_verify_op<int8_t, 16, 16, 16>(q, MD_i, r, 16,
+                                         std::multiplies<int8_t>{});
+    matrix_verify_op<int8_t, 16, 16, 16>(q, MD_i, r, -64,
+                                         std::minus<int8_t>{});
+    matrix_verify_op<uint8_t, 16, 16, 16>(q, MD_i, r, 0,
+                                          std::plus<uint8_t>{});
+    matrix_verify_op<uint8_t, 16, 16, 16>(q, MD_i, r, 0.0, Logical{});
+    matrix_verify_op<uint8_t, 16, 16, 16>(q, MD_i, r, 16,
+                                          std::multiplies<uint8_t>{});
+    matrix_verify_op<uint8_t, 16, 16, 16>(q, MD_i, r, -64,
+                                          std::minus<uint8_t>{});
+  }
+
+  if (computeCapability >= 8.0) {
+
+    matrix_verify_op<bfloat16, 16, 16, 16>(q, MD_f, r, 0.0,
+                                           std::plus<bfloat16>{});
+    matrix_verify_op<bfloat16, 16, 16, 16>(q, MD_f, r, 0.0, Logical{});
+    matrix_verify_op<bfloat16, 16, 16, 16>(q, MD_f, r, 16.0,
+                                           std::multiplies<bfloat16>{});
+    matrix_verify_op<bfloat16, 16, 16, 16>(q, MD_f, r, -56.0,
+                                           std::divides<bfloat16>{});
+    matrix_verify_op<bfloat16, 16, 16, 16>(q, MD_f, r, -64.0,
+                                           std::minus<bfloat16>{});
+
+    double D_d[MATRIX_M / 2][MATRIX_N / 2];
+    big_matrix<double, MATRIX_M / 2, MATRIX_N / 2> MD_d((double *)&D_d);
+
+    matrix_verify_op<double, 8, 4, 8>(q, MD_d, r, -60.0,
+                                      std::plus<double>{});
+    matrix_verify_op<double, 8, 4, 8>(q, MD_d, r, -60.0, Logical{});
+    matrix_verify_op<double, 8, 4, 8>(q, MD_d, r, -56.0,
+                                      std::multiplies<double>{});
+    matrix_verify_op<double, 8, 4, 8>(q, MD_d, r, -74.0,
+                                      std::divides<double>{});
+    matrix_verify_op<double, 8, 4, 8>(q, MD_d, r, -76.0,
+                                      std::minus<double>{});
+  }
+
+  return 0;
+}
diff --git a/SYCL/Matrix/element_wise_wi_marray.cpp b/SYCL/Matrix/element_wise_wi_marray.cpp
index 8bce3fe880..91ce3cc9b6 100644
--- a/SYCL/Matrix/element_wise_wi_marray.cpp
+++ b/SYCL/Matrix/element_wise_wi_marray.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 // REQUIRES: cuda
 
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=3 %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 %s -o %t.out
 // RUN: %t.out
 
 #include <sycl/sycl.hpp>
@@ -30,8 +30,8 @@ template <typename T> void verify_wi_marray(queue q) {
           [ERR](nd_item<2> spmd_item) [[sycl::reqd_sub_group_size(SG_SZ)]] {
             auto sg = spmd_item.get_sub_group();
 
-            joint_matrix<T, matrix_use::a, 16, 16, matrix_layout::row_major> sub_a;
-            joint_matrix<T, matrix_use::a, 16, 16, matrix_layout::row_major> sub_a_2;
+            joint_matrix<T, use::a, 16, 16, layout::row_major> sub_a;
+            joint_matrix<T, use::a, 16, 16, layout::row_major> sub_a_2;
 
             joint_matrix_fill(sg, sub_a, -1);
             joint_matrix_fill(sg, sub_a_2, -1);
diff --git a/SYCL/Matrix/element_wise_wi_marray_legacy.cpp b/SYCL/Matrix/element_wise_wi_marray_legacy.cpp
new file mode 100644
index 0000000000..8bce3fe880
--- /dev/null
+++ b/SYCL/Matrix/element_wise_wi_marray_legacy.cpp
@@ -0,0 +1,67 @@
+//==----------- element_wise_wi_marray.cpp - DPC++ joint_matrix------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: cuda
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=3 %s -o %t.out
+// RUN: %t.out
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi::experimental;
+using namespace sycl::ext::oneapi::experimental::matrix;
+
+#define SG_SZ 32
+
+template <typename T> void verify_wi_marray(queue q) {
+  int err = 0;
+  {
+    buffer<int> err_buf(&err, 1);
+    q.submit([&](handler &cgh) {
+      accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh);
+
+      cgh.parallel_for(
+          nd_range<2>({1, 1 * SG_SZ}, {1, 1 * SG_SZ}),
+          [ERR](nd_item<2> spmd_item) [[sycl::reqd_sub_group_size(SG_SZ)]] {
+            auto sg = spmd_item.get_sub_group();
+
+            joint_matrix<T, matrix_use::a, 16, 16, matrix_layout::row_major> sub_a;
+            joint_matrix<T, matrix_use::a, 16, 16, matrix_layout::row_major> sub_a_2;
+
+            joint_matrix_fill(sg, sub_a, -1);
+            joint_matrix_fill(sg, sub_a_2, -1);
+
+            auto wi_slice_a = sub_a.get_wi_data();
+            for (int i = 0; i < wi_slice_a.length(); i++) {
+              wi_slice_a[i] = fabs(wi_slice_a[i]);
+            }
+            sub_a_2.wi_marray = fabs(sub_a_2.wi_marray);
+
+            for (int i = 0; i < sub_a_2.wi_marray.size(); i++) {
+              if (sub_a_2.wi_marray[i] != wi_slice_a[i]) {
+                ERR[0] = 1;
+              }
+            }
+          }); // parallel for
+    }).wait();
+  }
+  assert(err == 0);
+}
+
+int main() {
+
+  queue q;
+  auto computeCapability =
+      std::stof(q.get_device().get_info<sycl::info::device::backend_version>());
+
+  if (computeCapability >= 8.0) {
+    verify_wi_marray<bfloat16>(q);
+  }
+
+  return 0;
+}
diff --git a/SYCL/Matrix/joint_matrix_tensorcores.cpp b/SYCL/Matrix/joint_matrix_tensorcores.cpp
new file mode 100644
index 0000000000..d401c3133e
--- /dev/null
+++ b/SYCL/Matrix/joint_matrix_tensorcores.cpp
@@ -0,0 +1,277 @@
+
+// REQUIRES: cuda
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 %s -o %t.out
+// RUN: %t.out
+//
+// This tests the latest unified matrix extension interfaces.
+// Specifying the sm version via the --cuda-gpu-arch flag is necessary
+// for the Nvidia case. DPC++ JIT compilation is not
+// supported for the Nvidia matrix extension, although some JIT optimizations
+// are performed at the level of the PTX assembly code.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi::experimental;
+using namespace sycl::ext::oneapi::experimental::matrix;
+constexpr float bf16_eps = 0.00390625;
+
+// Example usage of Nvidia matrix multiply.
+// Optimizations such as memory paddings for avoiding bank conflicts are not
+// included in this test, which aids clarity for what is going on. This example
+// forms a "Big matrix" corresponding to a single "TILE" using cuda example
+// terminology. Multiple TILES can be used to construct yet larger matrices.
+// This example uses row_major a, b, and accumulator matrices.
+
+// M, N, K define the unit sizes of dimensions of the three types (a, b,
+// accumulator) of matrices per subgroup operation:
+// M: number of rows of "C"/"D" (Accumulator) sub-matrices,
+//    number of rows of "A" sub-matrix.
+// N: number of cols of "C"/"D" (Accumulator) sub-matrices,
+//    number of cols of "B" sub-matrix.
+// K: number of cols of "A"/number of rows of "B" sub-matrices.
+
+// The number of threads per MMA subgroup is always 32 for Nvidia.
+constexpr int N_THREADS_PER_MATRIX_OP = 32;
+
+// number of submatrices per row of accumulator ("C", "D") matrices.
+constexpr int SUB_TILES_M = 2;
+// number of submatrices per col of accumulator matrices.
+constexpr int SUB_TILES_N = 3;
+// number of submatrices per col of "A"/per row of "B" matrices.
+constexpr int SUB_TILES_K = 4;
+
+template <typename T1, typename T2, size_t M, size_t K, size_t N>
+class TypeHelper;
+
+template <typename T1, typename T2, size_t M, size_t K, size_t N>
+using KernelName = class TypeHelper<T1, T2, M, K, N>;
+
+template <size_t Big_N, size_t Big_K, typename T1, typename T2>
+T2 matrix_ref_mn(const int &m, const int &n, T1 *A, T1 *B, T2 *C) {
+  T2 res = C[m * Big_N + n];
+
+  if constexpr (std::is_same<T1, bfloat16>::value) {
+    for (int k = 0; k < Big_K; k++)
+      res += A[m * Big_K + k] * B[k * Big_N + n];
+  } else {
+    for (int k = 0; k < Big_K; k++)
+      res +=
+          static_cast<T2>(A[m * Big_K + k]) * static_cast<T2>(B[k * Big_N + n]);
+  }
+
+  return res;
+}
+
+template <typename T1, typename T2, size_t Sub_Tiles_M, size_t Sub_Tiles_K,
+          size_t Sub_Tiles_N, size_t M, size_t K, size_t N,
+          typename T3 = std::remove_const_t<T1>>
+void test(queue &q) {
+  // total number of M dimension matrix elements for the "Big matrix".
+  constexpr auto Big_M = Sub_Tiles_M * M;
+  // total number of N dimension matrix elements for the "Big matrix".
+  constexpr auto Big_N = Sub_Tiles_N * N;
+  // total number of K dimension matrix elements for the "Big matrix".
+  constexpr auto Big_K = Sub_Tiles_K * K;
+
+  std::remove_const_t<T1> A[Big_M * Big_K];
+  std::remove_const_t<T1> B[Big_K * Big_N];
+  std::remove_const_t<T2> C[Big_M * Big_N];
+  std::remove_const_t<T2> D[Big_M * Big_N];
+
+  for (int i = 0; i < Big_M * Big_N; i++) {
+    C[i] = 1;
+    D[i] = 0;
+  }
+
+  if constexpr (!std::is_same<std::remove_const_t<T1>, bfloat16>::value) {
+    for (int i = 0; i < Big_M * Big_K; i++) {
+      A[i] = i % 100;
+    }
+
+    for (int i = 0; i < Big_K * Big_N; i++) {
+      B[i] = i % 100;
+    }
+  }
+  {
+    if constexpr (std::is_same<std::remove_const_t<T1>, bfloat16>::value) {
+
+      buffer<bfloat16, 1> bufA(A, range<1>(Big_M * Big_K));
+      buffer<bfloat16, 1> bufB(B, range<1>(Big_K * Big_N));
+      q.submit([&](handler &cgh) {
+        accessor<bfloat16, 1, access::mode::write, target::device> accA(bufA,
+                                                                        cgh);
+
+        cgh.parallel_for<KernelName<bfloat16, class copyA, M, K, N>>(
+            range<1>(Big_M * Big_K), [=](item<1> item) {
+              auto i = item.get_linear_id();
+              accA[i] = 0.1f * (i % 10);
+            });
+      });
+      q.submit([&](handler &cgh) {
+        accessor<bfloat16, 1, access::mode::write, target::device> accB(bufB,
+                                                                        cgh);
+
+        cgh.parallel_for<KernelName<bfloat16, class copyB, M, K, N>>(
+            range<1>(Big_K * Big_N), [=](item<1> item) {
+              auto i = item.get_linear_id();
+              accB[i] = 0.1f * (i % 10);
+            });
+      });
+    }
+
+    buffer<T1, 1> bufA(A, range<1>(Big_M * Big_K));
+    buffer<T1, 1> bufB(B, range<1>(Big_K * Big_N));
+    buffer<T2, 1> bufC(C, range<1>(Big_M * Big_N));
+    buffer<std::remove_const_t<T2>, 1> bufD(D, range<1>(Big_M * Big_N));
+
+    q.submit([&](handler &cgh) {
+      accessor<T1, 1, access::mode::read, target::device> accA(bufA, cgh);
+      accessor<T1, 1, access::mode::read, target::device> accB(bufB, cgh);
+      accessor<T2, 1, access::mode::read, target::device> accC(bufC, cgh);
+      accessor<std::remove_const_t<T2>, 1, access::mode::write, target::device>
+          accD(bufD, cgh);
+
+      range<2> LocalRange = {1, N_THREADS_PER_MATRIX_OP};
+      range<2> GlobalRange = {Sub_Tiles_M,
+                              Sub_Tiles_N * N_THREADS_PER_MATRIX_OP};
+
+      cgh.parallel_for<KernelName<T1, T2, M, K, N>>(
+          nd_range<2>(GlobalRange, LocalRange), [=](nd_item<2> item) {
+            sub_group sg = item.get_sub_group();
+            // row id of current submatrix of BIG C matrix
+            const auto m = item.get_group().get_group_id()[0];
+            // column id of current submatrix of BIG C matrix
+            const auto n = item.get_group().get_group_id()[1];
+
+            joint_matrix<T3, use::a, M, K, layout::row_major> sub_a;
+            joint_matrix<T3, use::b, K, N, layout::row_major> sub_b;
+            joint_matrix<std::remove_const_t<T2>, use::accumulator, M, N>
+                sub_c;
+
+            joint_matrix_load(sg, sub_c,
+                              accC.get_pointer() + (m * M) * Big_N + n * N,
+                              Big_N, layout::row_major);
+            // k = row/col id of current submatrix of BIG A/B matrices
+            for (int k = 0; k < Sub_Tiles_K; k++) {
+              joint_matrix_load(sg, sub_a,
+                                accA.get_pointer() + (k * K) + (m * M * Big_K),
+                                Big_K);
+
+              joint_matrix_load(sg, sub_b,
+                                accB.get_pointer() + (k * K * Big_N) + (n * N),
+                                Big_N);
+
+              // round values to correct precision if using tf32
+              if constexpr (std::is_same<T3, precision::tf32>::value) {
+                auto wi_size = sub_a.wi_marray.size();
+                assert(wi_size == sub_b.wi_marray.size());
+                for (auto i = 0; i < wi_size; ++i) {
+                  sub_a.wi_marray[i] = round_to_tf32(sub_a.wi_marray[i]);
+                  sub_b.wi_marray[i] = round_to_tf32(sub_b.wi_marray[i]);
+                }
+              }
+
+              sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
+            }
+            joint_matrix_store(sg, sub_c,
+                               accD.get_pointer() + (m * M) * Big_N + n * N,
+                               Big_N, layout::row_major);
+          });
+    });
+    q.wait();
+  }
+
+  for (int m = 0; m < Big_M; m++) {
+    for (int n = 0; n < Big_N; n++) {
+      if constexpr (std::is_same<std::remove_const_t<T1>, bfloat16>::value) {
+        auto res_device = matrix_ref_mn<Big_N, Big_K>(m, n, A, B, C);
+        assert(fabs(2 * (D[m * Big_N + n] - res_device)) /
+                   (D[m * Big_N + n] + res_device) <
+               bf16_eps * 2);
+      } else {
+        assert(
+            (D[m * Big_N + n] == matrix_ref_mn<Big_N, Big_K>(m, n, A, B, C)));
+      }
+    }
+  }
+};
+
+int main() {
+
+  queue Q;
+  auto computeCapability =
+      std::stof(Q.get_device().get_info<sycl::info::device::backend_version>());
+
+  if (computeCapability >= 7.0) {
+    // A/B half, Accumulator float
+    test<half, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 16, 16, 16>(Q);
+    test<half, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8, 16, 32>(Q);
+    test<half, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 32, 16, 8>(Q);
+
+    test<const half, const float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 16,
+         16, 16>(Q);
+    test<const half, const float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8, 16,
+         32>(Q);
+    test<const half, const float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 32,
+         16, 8>(Q);
+
+    // A/B/Accumulator half
+    test<half, half, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 16, 16, 16>(Q);
+    test<half, half, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8, 16, 32>(Q);
+    test<half, half, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 32, 16, 8>(Q);
+
+    test<const half, const half, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 16, 16,
+         16>(Q);
+    test<const half, const half, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8, 16,
+         32>(Q);
+    test<const half, const half, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 32, 16,
+         8>(Q);
+  }
+  if (computeCapability >= 7.2) {
+    test<int8_t, int32_t, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 16, 16, 16>(Q);
+    test<int8_t, int32_t, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8, 16, 32>(Q);
+    test<int8_t, int32_t, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 32, 16, 8>(Q);
+
+    test<const int8_t, const int32_t, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N,
+         16, 16, 16>(Q);
+    test<const int8_t, const int32_t, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8,
+         16, 32>(Q);
+    test<const int8_t, const int32_t, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N,
+         32, 16, 8>(Q);
+
+    test<uint8_t, int32_t, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 16, 16, 16>(
+        Q);
+    test<uint8_t, int32_t, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8, 16, 32>(Q);
+    test<uint8_t, int32_t, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 32, 16, 8>(Q);
+
+    test<const uint8_t, const int32_t, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N,
+         16, 16, 16>(Q);
+    test<const uint8_t, const int32_t, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N,
+         8, 16, 32>(Q);
+    test<const uint8_t, const int32_t, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N,
+         32, 16, 8>(Q);
+  }
+  if (computeCapability >= 8.0) {
+    test<double, double, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8, 4, 8>(Q);
+    test<const double, const double, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8,
+         4, 8>(Q);
+
+    test<bfloat16, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 16, 16, 16>(Q);
+    test<bfloat16, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8, 16, 32>(Q);
+    test<bfloat16, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 32, 16, 8>(Q);
+
+    test<const bfloat16, const float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N,
+         16, 16, 16>(Q);
+    test<const bfloat16, const float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8,
+         16, 32>(Q);
+    test<const bfloat16, const float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N,
+         32, 16, 8>(Q);
+
+    // A/B tf32
+    test<float, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 16, 8, 16,
+         precision::tf32>(Q);
+    test<const float, const float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 16,
+         8, 16, precision::tf32>(Q);
+  }
+  return 0;
+};
diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcores_legacy.cpp
similarity index 99%
rename from SYCL/Matrix/joint_matrix_tensorcore.cpp
rename to SYCL/Matrix/joint_matrix_tensorcores_legacy.cpp
index 72483bf282..d98a389f1e 100644
--- a/SYCL/Matrix/joint_matrix_tensorcore.cpp
+++ b/SYCL/Matrix/joint_matrix_tensorcores_legacy.cpp
@@ -2,6 +2,7 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=3 %s -o %t.out
 // RUN: %t.out
 //
+// This tests the deprecated legacy matrix extension interfaces.
 // Specifying the sm version via the --cuda-gpu-arch flag is necessary
 // for the Nvidia case. DPC++ JIT compilation is not
 // supported for the Nvidia matrix extension, although some JIT optimizations
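
Taken together, these changes migrate the CUDA tests from the version 3 names (`matrix_use`, `matrix_layout`, and a compile-time accumulator layout) to the version 4 unified interface (`use`, `layout`, and an accumulator whose layout is supplied at load/store time), keeping `_legacy` copies pinned to version 3. The sketch below summarizes the unified usage pattern the new tests exercise. It is a minimal illustration only: the `one_tile` helper, its 16x16x16 half/float shape, and the `global_space` pointer parameters are assumptions for the example, not part of the patch, and it must be compiled like the RUN lines above (with `-DSYCL_EXT_ONEAPI_MATRIX_VERSION=4` and an explicit `--cuda-gpu-arch`).

```cpp
#include <sycl/sycl.hpp>

using namespace sycl;
using namespace sycl::ext::oneapi::experimental::matrix;

// Hypothetical device helper: one 16x16x16 (M x N x K) half/float MMA per
// sub-group. pA/pB/pC point at row-major data; ldA/ldB/ldC are the leading
// dimensions (strides) of the full matrices, as in the tests above.
SYCL_EXTERNAL void
one_tile(sub_group sg,
         multi_ptr<half, access::address_space::global_space> pA,
         multi_ptr<half, access::address_space::global_space> pB,
         multi_ptr<float, access::address_space::global_space> pC,
         size_t ldA, size_t ldB, size_t ldC) {
  // a/b matrices still carry their layout as a template parameter.
  joint_matrix<half, use::a, 16, 16, layout::row_major> sub_a;
  joint_matrix<half, use::b, 16, 16, layout::row_major> sub_b;
  // Version 4: the accumulator no longer has a layout template parameter...
  joint_matrix<float, use::accumulator, 16, 16> sub_c;

  joint_matrix_load(sg, sub_a, pA, ldA);
  joint_matrix_load(sg, sub_b, pB, ldB);
  // ...its layout is passed at run time when loading and storing instead,
  // which is exactly the extra layout::row_major argument the diff adds.
  joint_matrix_load(sg, sub_c, pC, ldC, layout::row_major);
  sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
  joint_matrix_store(sg, sub_c, pC, ldC, layout::row_major);
}
```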