From 1b4b671fb10c39d01c6d419532f90666dd468041 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Mon, 12 Jun 2023 08:24:36 -0500
Subject: [PATCH] exclude call of div() with MKL 2023.1.0

---
 .github/workflows/conda-package.yml       |  8 +--
 dpnp/backend/extensions/vm/CMakeLists.txt | 13 -----
 dpnp/backend/extensions/vm/div.cpp        | 66 +++++++----------------
 3 files changed, 22 insertions(+), 65 deletions(-)
diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index 5645191b7c6e..fbfe66ff17b9 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -194,10 +194,10 @@ jobs:
       # TODO: run the whole scope once the issues on CPU are resolved
       - name: Run tests
         run: |
-          python -m pytest -q -ra --disable-warnings -vv -s ${{ env.TEST_SCOPE }}
+          python -m pytest -q -ra --disable-warnings -vv ${{ env.TEST_SCOPE }}
         working-directory: ${{ env.tests-path }}
         env:
-          SYCL_QUEUE_THREAD_POOL_SIZE: 16
+          SYCL_QUEUE_THREAD_POOL_SIZE: 6
 
   test_windows:
     name: Test ['windows-latest', python='${{ matrix.python }}']
@@ -333,10 +333,10 @@ jobs:
       # TODO: run the whole scope once the issues on CPU are resolved
       - name: Run tests
         run: |
-          python -m pytest -q -ra --disable-warnings -vv -s ${{ env.TEST_SCOPE }}
+          python -m pytest -q -ra --disable-warnings -vv ${{ env.TEST_SCOPE }}
         working-directory: ${{ env.tests-path }}
         env:
-          SYCL_QUEUE_THREAD_POOL_SIZE: 16
+          SYCL_QUEUE_THREAD_POOL_SIZE: 6
 
   upload:
     name: Upload ['${{ matrix.os }}', python='${{ matrix.python }}']
diff --git a/dpnp/backend/extensions/vm/CMakeLists.txt b/dpnp/backend/extensions/vm/CMakeLists.txt
index 07a4ffae8aba..8f3086ec3a9e 100644
--- a/dpnp/backend/extensions/vm/CMakeLists.txt
+++ b/dpnp/backend/extensions/vm/CMakeLists.txt
@@ -55,9 +55,7 @@ else()
   target_compile_options(${python_module_name} PRIVATE
     -fno-approx-func
     -fno-finite-math-only
-    -no-ipo
     )
-    target_link_options(${python_module_name} PRIVATE -no-ipo)
 endif()
 
 target_link_options(${python_module_name} PUBLIC -fsycl-device-code-split=per_kernel)
@@ -72,17 +70,6 @@ endif()
 
 target_link_libraries(${python_module_name} PUBLIC MKL::MKL_DPCPP)
 
-target_link_libraries(${python_module_name} PUBLIC oneDPL)
-
-if (UNIX)
-  # needed for STL headers with GCC < 11
-  target_compile_definitions(${python_module_name} PUBLIC _GLIBCXX_USE_TBB_PAR_BACKEND=0)
-endif()
-
-target_compile_definitions(${python_module_name} PUBLIC PSTL_USE_PARALLEL_POLICIES=0)
-# work-around for Windows at exit crash with predefined policies
-target_compile_definitions(${python_module_name} PUBLIC ONEDPL_USE_PREDEFINED_POLICIES=0)
-
 install(TARGETS ${python_module_name}
   DESTINATION "dpnp/backend/extensions/vm"
 )
diff --git a/dpnp/backend/extensions/vm/div.cpp b/dpnp/backend/extensions/vm/div.cpp
index 8a6751a45a44..28fbe1cdf3cc 100644
--- a/dpnp/backend/extensions/vm/div.cpp
+++ b/dpnp/backend/extensions/vm/div.cpp
@@ -64,46 +64,16 @@ static sycl::event div_impl(sycl::queue exec_q,
 {
     type_utils::validate_type_for_device<T>(exec_q);
 
-    std::cerr << "enter div_impl" << std::endl;
+    const T* a = reinterpret_cast<const T*>(in_a);
+    const T* b = reinterpret_cast<const T*>(in_b);
+    T* y = reinterpret_cast<T*>(out_y);
 
-    const T* _a = reinterpret_cast<const T*>(in_a);
-    const T* _b = reinterpret_cast<const T*>(in_b);
-    T* _y = reinterpret_cast<T*>(out_y);
-
-    std::cerr << "casting is done" << std::endl;
-
-    T* a = sycl::malloc_device<T>(n, exec_q);
-    T* b = sycl::malloc_device<T>(n, exec_q);
-    T* y = sycl::malloc_device<T>(n, exec_q);
-
-    std::cerr << "malloc is done" << std::endl;
-
-    exec_q.copy(_a, a, n).wait();
-    exec_q.copy(_b, b, n).wait();
-    exec_q.copy(_y, y, n).wait();
-
-    std::cerr << "copy is done" << std::endl;
-
-    sycl::event ev = mkl_vm::div(exec_q,
+    return mkl_vm::div(exec_q,
                        n, // number of elements to be calculated
                        a, // pointer `a` containing 1st input vector of size n
                        b, // pointer `b` containing 2nd input vector of size n
                        y, // pointer `y` to the output vector of size n
                        depends);
-    ev.wait();
-
-    std::cerr << "div is done" << std::endl;
-
-    exec_q.copy(y, _y, n).wait();
-
-    std::cerr << "copy is done" << std::endl;
-
-    sycl::free(a, exec_q);
-    sycl::free(b, exec_q);
-    sycl::free(y, exec_q);
-
-    std::cerr << "leaving div_impl" << std::endl;
-    return sycl::event();
 }
 
 std::pair<sycl::event, sycl::event> div(sycl::queue exec_q,
@@ -205,20 +175,9 @@ std::pair<sycl::event, sycl::event> div(sycl::queue exec_q,
         throw py::value_error("No div implementation defined");
     }
     sycl::event sum_ev = div_fn(exec_q, src_nelems, src1_data, src2_data, dst_data, depends);
-    // sum_ev.wait();
-
-    // int* dummy = sycl::malloc_device<int>(1, exec_q);
-    // sycl::event cleanup_ev = exec_q.submit([&](sycl::handler& cgh) {
-    //     // cgh.depends_on(sum_ev);
-    //     auto ctx = exec_q.get_context();
-    //     cgh.host_task([dummy, ctx]() {
-    //         // dummy host task to pass into keep_args_alive
-    //         sycl::free(dummy, ctx);
-    //     });
-    // });
-
-    // sycl::event ht_ev = dpctl::utils::keep_args_alive(exec_q, {src1, src2, dst}, {sum_ev});
-    // return std::make_pair(ht_ev, sum_ev);
+
+    sycl::event ht_ev = dpctl::utils::keep_args_alive(exec_q, {src1, src2, dst}, {sum_ev});
+    return std::make_pair(ht_ev, sum_ev);
     return std::make_pair(sycl::event(), sycl::event());
 }
 
@@ -227,6 +186,7 @@ bool can_call_div(sycl::queue exec_q,
                   dpctl::tensor::usm_ndarray src2,
                   dpctl::tensor::usm_ndarray dst)
 {
+#if INTEL_MKL_VERSION >= 20230002
     // check type_nums
     int src1_typenum = src1.get_typenum();
     int src2_typenum = src2.get_typenum();
@@ -325,6 +285,16 @@ bool can_call_div(sycl::queue exec_q,
         return false;
     }
     return true;
+#else
+    // With oneMKL 2023.1.0, a call to oneapi::mkl::vm::div() deadlocks
+    // inside ~usm_wrapper_to_host()->{...; q_->wait_and_throw(); ...}
+
+    (void)exec_q;
+    (void)src1;
+    (void)src2;
+    (void)dst;
+    return false;
+#endif // INTEL_MKL_VERSION >= 20230002
 }
 
 template <typename fnT, typename T>