ginkgo-project · yhmtsai · Mar 15, 2023 · Nov 25, 2022 · Nov 28, 2022 · Nov 29, 2022
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -281,6 +281,12 @@ if(MSVC)
     endif()
 endif()
 
+if(GINKGO_BUILD_DPCPP)
+    ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_MAJOR_VERSION __LIBSYCL_MAJOR_VERSION)
+    ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_VERSION __SYCL_COMPILER_VERSION)
+else()
+    set(GINKGO_DPCPP_MAJOR_VERSION "0")
+endif()
 configure_file(${Ginkgo_SOURCE_DIR}/include/ginkgo/config.hpp.in
     ${Ginkgo_BINARY_DIR}/include/ginkgo/config.hpp @ONLY)
 

diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in
@@ -78,6 +78,7 @@ set(GINKGO_HIP_VERSION @GINKGO_HIP_VERSION@)
 set(GINKGO_AMD_ARCH_FLAGS @GINKGO_AMD_ARCH_FLAGS@)
 
 set(GINKGO_DPCPP_VERSION @GINKGO_DPCPP_VERSION@)
+set(GINKGO_DPCPP_MAJOR_VERSION @GINKGO_DPCPP_MAJOR_VERSION@)
 set(GINKGO_DPCPP_FLAGS @GINKGO_DPCPP_FLAGS@)
 set(GINKGO_MKL_ROOT @GINKGO_MKL_ROOT@)
 set(GINKGO_DPL_ROOT @GINKGO_DPL_ROOT@)

diff --git a/cmake/build_helpers.cmake b/cmake/build_helpers.cmake
@@ -134,9 +134,9 @@ function(ginkgo_extract_clang_version CLANG_COMPILER GINKGO_CLANG_VERSION)
 endfunction()
 
 # Extract the DPC++ version
-function(ginkgo_extract_dpcpp_version DPCPP_COMPILER GINKGO_DPCPP_VERSION)
+function(ginkgo_extract_dpcpp_version DPCPP_COMPILER GINKGO_DPCPP_VERSION MACRO_VAR)
     set(DPCPP_VERSION_PROG "#include <CL/sycl.hpp>\n#include <iostream>\n"
-        "int main() {std::cout << __SYCL_COMPILER_VERSION << '\\n'\;"
+        "int main() {std::cout << ${MACRO_VAR} << '\\n'\;"
         "return 0\;}")
     file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/extract_dpcpp_ver.cpp" ${DPCPP_VERSION_PROG})
     execute_process(COMMAND ${DPCPP_COMPILER} ${CMAKE_CURRENT_BINARY_DIR}/extract_dpcpp_ver.cpp

diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu
@@ -94,7 +94,7 @@ namespace csr {
 constexpr int default_block_size = 512;
 constexpr int warps_in_block = 4;
 constexpr int spmv_block_size = warps_in_block * config::warp_size;
-constexpr int classical_overweight = 32;
+constexpr int classical_oversubscription = 32;
 
 
 /**
@@ -240,7 +240,8 @@ void classical_spmv(syn::value_list<int, subwarp_size>,
                     const matrix::Dense<ValueType>* beta = nullptr)
 {
     const auto nwarps = exec->get_num_warps_per_sm() *
-                        exec->get_num_multiprocessor() * classical_overweight;
+                        exec->get_num_multiprocessor() *
+                        classical_oversubscription;
     const auto gridx =
         std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size),
                  int64(nwarps / warps_in_block));

diff --git a/cuda/matrix/sparsity_csr_kernels.cu b/cuda/matrix/sparsity_csr_kernels.cu
@@ -60,7 +60,7 @@ namespace cuda {
 namespace sparsity_csr {
 
 
-constexpr int classical_overweight = 32;
+constexpr int classical_oversubscription = 32;
 constexpr int spmv_block_size = 128;
 constexpr int warps_in_block = 4;
 
@@ -92,7 +92,8 @@ void classical_spmv(syn::value_list<int, subwarp_size>,
         gko::acc::reduced_row_major<2, arithmetic_type, OutputValueType>;
 
     const auto nwarps = exec->get_num_warps_per_sm() *
-                        exec->get_num_multiprocessor() * classical_overweight;
+                        exec->get_num_multiprocessor() *
+                        classical_oversubscription;
     const auto gridx =
         std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size),
                  int64(nwarps / warps_in_block));

diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
@@ -1,6 +1,3 @@
-ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_VERSION)
-set(GINKGO_DPCPP_VERSION ${GINKGO_DPCPP_VERSION} PARENT_SCOPE)
-
 find_package(MKL CONFIG REQUIRED HINTS "$ENV{MKLROOT}")
 set(GINKGO_MKL_ROOT "${MKL_ROOT}" PARENT_SCOPE)
 find_package(oneDPL REQUIRED HINTS "$ENV{DPL_ROOT}")

diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp
@@ -109,7 +109,7 @@ void DpcppExecutor::raw_free(void* ptr) const noexcept
     try {
         queue_->wait_and_throw();
         sycl::free(ptr, queue_->get_context());
-    } catch (cl::sycl::exception& err) {
+    } catch (sycl::exception& err) {
 #if GKO_VERBOSE_LEVEL >= 1
         // Unfortunately, if memory free fails, there's not much we can do
         std::cerr << "Unrecoverable Dpcpp error on device "
@@ -231,7 +231,7 @@ void delete_queue(sycl::queue* queue)
 }
 
 
-::cl::sycl::property_list get_property_list(dpcpp_queue_property property)
+sycl::property_list get_property_list(dpcpp_queue_property property)
 {
     if (property == dpcpp_queue_property::in_order) {
         return {sycl::property::queue::in_order{}};
@@ -257,11 +257,11 @@ void DpcppExecutor::set_device_property(dpcpp_queue_property property)
     if (!device.is_host()) {
         try {
             auto subgroup_sizes =
-                device.get_info<cl::sycl::info::device::sub_group_sizes>();
+                device.get_info<sycl::info::device::sub_group_sizes>();
             for (auto& i : subgroup_sizes) {
                 this->get_exec_info().subgroup_sizes.push_back(i);
             }
-        } catch (cl::sycl::exception& err) {
+        } catch (sycl::exception& err) {
             GKO_NOT_SUPPORTED(device);
         }
     }
@@ -274,15 +274,38 @@ void DpcppExecutor::set_device_property(dpcpp_queue_property property)
     }
     this->get_exec_info().max_workgroup_size = static_cast<int>(
         device.get_info<sycl::info::device::max_work_group_size>());
+// They change the max_work_item_size with template parameter Dimension after
+// major version 6 and adding the default = 3 is not in the same release.
+#if GINKGO_DPCPP_MAJOR_VERSION >= 6
+    auto max_workitem_sizes =
+        device.get_info<sycl::info::device::max_work_item_sizes<3>>();
+#else
     auto max_workitem_sizes =
         device.get_info<sycl::info::device::max_work_item_sizes>();
+#endif
     // Get the max dimension of a sycl::id object
     auto max_work_item_dimensions =
         device.get_info<sycl::info::device::max_work_item_dimensions>();
     for (uint32 i = 0; i < max_work_item_dimensions; i++) {
         this->get_exec_info().max_workitem_sizes.push_back(
             max_workitem_sizes[i]);
     }
+
+    // Get the hardware threads per eu
+    if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
+#if GINKGO_DPCPP_MAJOR_VERSION >= 6
+        this->get_exec_info().num_pu_per_cu = device.get_info<
+            sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
+#else
+        this->get_exec_info().num_pu_per_cu = device.get_info<
+            sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
+#endif
+    } else {
+        // To make the usage still valid.
+        // TODO: check the value for other vendor gpu or cpu.
+        this->get_exec_info().num_pu_per_cu = 1;
+    }
+
     // Here we declare the queue with the property `in_order` which ensures the
     // kernels are executed in the submission order. Otherwise, calls to
     // `wait()` would be needed after every call to a DPC++ function or kernel.

diff --git a/dpcpp/base/helper.dp.cpp b/dpcpp/base/helper.dp.cpp
@@ -46,7 +46,7 @@ bool validate(sycl::queue* queue, unsigned int workgroup_size,
 {
     auto device = queue->get_device();
     auto subgroup_size_list =
-        device.get_info<cl::sycl::info::device::sub_group_sizes>();
+        device.get_info<sycl::info::device::sub_group_sizes>();
     auto max_workgroup_size =
         device.get_info<sycl::info::device::max_work_group_size>();
     bool allowed = false;

diff --git a/dpcpp/base/onemkl_bindings.hpp b/dpcpp/base/onemkl_bindings.hpp
@@ -95,7 +95,7 @@ struct is_supported<std::complex<double>> : std::true_type {};
 
 
 #define GKO_BIND_DOT(ValueType, Name, Func)                                    \
-    inline void Name(::cl::sycl::queue& exec_queue, std::int64_t n,            \
+    inline void Name(sycl::queue& exec_queue, std::int64_t n,                  \
                      const ValueType* x, std::int64_t incx,                    \
                      const ValueType* y, std::int64_t incy, ValueType* result) \
     {                                                                          \