Merge branch 'intel:main' into test/array_api_testing

uxlfoundation · Oct 1, 2024 · 7a15963 · 7a15963
2 parents c9ff599 + 3208718
commit 7a15963
Show file tree

Hide file tree

Showing 74 changed files with 1,154 additions and 306 deletions.
diff --git a/.circleci/deselect_tests.py b/.circleci/deselect_tests.py
@@ -22,7 +22,12 @@
 import warnings
 
 import sklearn
-from packaging.version import Version
+
+try:
+    from packaging.version import Version
+except ImportError:
+    from distutils.version import LooseVersion as Version
+
 from sklearn import __version__ as sklearn_version
 from yaml import FullLoader
 from yaml import load as yaml_load

diff --git a/.github/Pull_Request_template.md b/.github/Pull_Request_template.md
@@ -1,28 +1,35 @@
-### Description
+## Description
 
 _Add a comprehensive description of proposed changes_
 
-_List issue number(s) if exist(s): #6 (for example)_
+_List associated issue number(s), if any: #6 (for example)_
+
+_Documentation PR (if needed): #1340 (for example)_
+
+_Benchmarks PR (if needed): https://github.com/IntelPython/scikit-learn_bench/pull/155 (for example)_
 
 ---
 
-Checklist to comply with before moving PR from draft:
+Checklist to comply with **before moving PR from draft**:
 
 **PR completeness and readability**
 
 - [ ] I have reviewed my changes thoroughly before submitting this pull request.
 - [ ] I have commented my code, particularly in hard-to-understand areas.
 - [ ] I have updated the documentation to reflect the changes or created a separate PR with update and provided its number in the description, if necessary.
 - [ ] Git commit message contains an appropriate signed-off-by string _(see [CONTRIBUTING.md](https://github.com/intel/scikit-learn-intelex/blob/main/CONTRIBUTING.md#pull-requests) for details)_.
-- [ ] I have added a respective label(s) to PR if I have a permission for that.  
+- [ ] I have added the respective label(s) to the PR if I have permission to do so.
 - [ ] I have resolved any merge conflicts that might occur with the base branch.
 
 **Testing**
 
-- [ ] The unit tests pass successfully.
 - [ ] I have run it locally and tested the changes extensively.
+- [ ] All CI jobs are green or I have provided justification why they aren't.
+- [ ] I have extended the testing suite if new functionality was introduced in this PR.
 
 **Performance**
 
 - [ ] I have measured performance for affected algorithms using [scikit-learn_bench](https://github.com/IntelPython/scikit-learn_bench) and provided at least summary table with measured data, if performance change is expected.
 - [ ] I have provided justification why performance has changed or why changes are not expected.
+- [ ] I have provided justification why quality metrics have changed or why changes are not expected.
+- [ ] I have extended the benchmarking suite and provided the corresponding scikit-learn_bench PR if new measurable functionality was introduced in this PR.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -23,6 +23,10 @@ on:
       - main
   workflow_dispatch:
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref_name }}-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
 env:
   TBB_VERSION: 2021.13
   DPCPP_VERSION: 2024.2

diff --git a/.github/workflows/pr-checklist.yml b/.github/workflows/pr-checklist.yml
@@ -0,0 +1,54 @@
+#===============================================================================
+# Copyright 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+name: Check PR Checklist
+
+on:
+  pull_request:
+    types: [opened, edited, synchronize]
+
+jobs:
+  checklist:
+    name: Close all checkboxes before moving from draft
+    timeout-minutes: 5
+    runs-on: ubuntu-24.04
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+    - name: Get pull request details
+      id: pr
+      uses: actions/github-script@v7
+      with:
+        script: |
+          const pr_desc = await github.rest.pulls.get({
+            owner: context.repo.owner,
+            repo: context.repo.repo,
+            pull_number: context.payload.pull_request.number
+          });
+          core.setOutput('body', pr_desc.data.body)
+          core.setOutput('draft', pr_desc.data.draft)
+    - name: Check if all checkboxes are checked
+      id: checkboxes
+      env:
+        DESCRIPTION: ${{ steps.pr.outputs.body }}
+      run: |
+        UNCHECKED=$(echo "$DESCRIPTION" | grep -c '\[ \]' || true)
+        echo "unchecked=$UNCHECKED" >> $GITHUB_OUTPUT
+    - name: Fail if not all checkboxes are checked and PR is not draft
+      if: ${{ (steps.pr.outputs.draft == 'false') && (steps.checkboxes.outputs.unchecked != '0') }}
+      run: |
+        echo "Unchecked checkboxes: ${{ steps.checkboxes.outputs.unchecked }}"
+        exit 1
diff --git a/INSTALL.md b/INSTALL.md
@@ -196,6 +196,14 @@ The build-process (using setup.py) happens in 4 stages:
    python setup.py develop --no-deps
    ```
 
+- To build the Python module without installing it:
+
+   ```bash
+   cd <checkout-dir>
+   python setup.py build_ext --inplace --force
+   python setup.py build
+   ```
+
 Where: 
 
 * Keys `--single-version-externally-managed` and `--no-deps` are required to not download daal4py after the installation of Intel(R) Extension for Scikit-learn. 

diff --git a/daal4py/__init__.py b/daal4py/__init__.py
@@ -31,10 +31,17 @@
     path_to_libs = os.path.join(path_to_env, "Library", "bin")
     if sys.version_info.minor >= 8:
         if "DALROOT" in os.environ:
-            dal_root_redist = os.path.join(os.environ["DALROOT"], "redist", arch_dir)
+            dal_root = os.environ["DALROOT"]
+            dal_root_redist = os.path.join(dal_root, "redist", arch_dir)
             if os.path.exists(dal_root_redist):
                 os.add_dll_directory(dal_root_redist)
                 os.environ["PATH"] = dal_root_redist + os.pathsep + os.environ["PATH"]
+        if "TBBROOT" in os.environ:
+            tbb_root = os.environ["TBBROOT"]
+            tbb_root_redist = os.path.join(tbb_root, "bin")
+            if os.path.exists(tbb_root_redist):
+                os.add_dll_directory(tbb_root_redist)
+                os.environ["PATH"] = tbb_root_redist + os.pathsep + os.environ["PATH"]
 
         try:
             os.add_dll_directory(path_to_libs)

diff --git a/daal4py/sklearn/_n_jobs_support.py b/daal4py/sklearn/_n_jobs_support.py
@@ -76,7 +76,7 @@ def _run_with_n_jobs(method):
     """
 
     @wraps(method)
-    def method_wrapper(self, *args, **kwargs):
+    def n_jobs_wrapper(self, *args, **kwargs):
         # threading parallel backend branch
         if not isinstance(threading.current_thread(), threading._MainThread):
             warn(
@@ -117,20 +117,22 @@ def method_wrapper(self, *args, **kwargs):
                 n_jobs = max(1, n_threads + n_jobs + 1)
         # branch with set n_jobs
         old_n_threads = get_n_threads()
-        if n_jobs != old_n_threads:
+        if n_jobs == old_n_threads:
+            return method(self, *args, **kwargs)
+
+        try:
             logger = logging.getLogger("sklearnex")
             cl = self.__class__
             logger.debug(
                 f"{cl.__module__}.{cl.__name__}.{method.__name__}: "
                 f"setting {n_jobs} threads (previous - {old_n_threads})"
             )
             set_n_threads(n_jobs)
-        result = method(self, *args, **kwargs)
-        if n_jobs != old_n_threads:
+            return method(self, *args, **kwargs)
+        finally:
             set_n_threads(old_n_threads)
-        return result
 
-    return method_wrapper
+    return n_jobs_wrapper
 
 
 def control_n_jobs(decorated_methods: list = []):

diff --git a/daal4py/sklearn/_utils.py b/daal4py/sklearn/_utils.py
@@ -30,7 +30,10 @@
 
 import logging
 
-from packaging.version import Version
+try:
+    from packaging.version import Version
+except ImportError:
+    from distutils.version import LooseVersion as Version
 
 try:
     from pandas import DataFrame

diff --git a/dependencies-dev b/dependencies-dev
@@ -2,6 +2,6 @@ Cython==3.0.11
 Jinja2==3.1.4
 numpy==2.0.1 ; python_version <= '3.9'
 numpy==2.1.1 ; python_version > '3.9'
-pybind11==2.13.5
-cmake==3.30.2
-setuptools==74.1.2
+pybind11==2.13.6
+cmake==3.30.3
+setuptools==75.1.0
diff --git a/deselected_tests.yaml b/deselected_tests.yaml
@@ -380,6 +380,11 @@ deselected_tests:
   - model_selection/tests/test_classification_threshold.py::test_fit_and_score_over_thresholds_sample_weight >=1.5
   - model_selection/tests/test_classification_threshold.py::test_tuned_threshold_classifier_cv_zeros_sample_weights_equivalence >=1.5
 
+  # Deselections for 2025.0
+  - ensemble/tests/test_forest.py::test_importances[ExtraTreesRegressor-squared_error-float64]
+
+  - cluster/tests/test_k_means.py::test_kmeans_elkan_results[42-1e-100-sparse_array-normal]
+
   # --------------------------------------------------------
   # No need to test daal4py patching
 reduced_tests:

diff --git a/generator/wrapper_gen.py b/generator/wrapper_gen.py
@@ -374,7 +374,7 @@ def __dealloc__(self):
     def __init__(self, int64_t ptr=0):
         self.c_ptr = <{{class_type|flat}}>ptr
 
-    def __str__(self):
+    def __repr__(self):
         return _str(self, [{% for m in enum_gets+named_gets %}'{{m[1]}}',{% endfor %}])
 {% for m in enum_gets+named_gets %}
 {% set rtype = m[2]|d2cy(False) if m in enum_gets else m[0]|d2cy(False) %}
@@ -1003,6 +1003,8 @@ def __cinit__(self):
 
 # this is our actual algorithm class for Python
 cdef class {{algo}}{{'('+iface[0]|lower+'__iface__)' if iface[0] else ''}}:
+    cdef tuple _params
+
     '''
     {{algo}}
     {{params_all|fmt('{}', 'sphinx', sep='\n')|indent(4)}}
@@ -1017,6 +1019,17 @@ def __cinit__(self,
         self.c_ptr = mk_{{algo}}(
             {{params_all|fmt('{}', 'arg_cyext', sep=',\n')|indent(25+(algo|length))}}
         )
+        current_locals = locals()
+        ordered_input_args = '''
+            {{params_all|fmt('{}', 'name', sep=' ')|indent(0)}}
+        '''.strip().split()
+        self._params = tuple(
+            current_locals[arg]
+            for arg in ordered_input_args
+        )
+
+    def __reduce__(self):
+        return (self.__class__, self._params)
 
 {% if not iface[0] %}
     # the C++ manager__iface__ (de-templatized)

diff --git a/onedal/cluster/dbscan.cpp b/onedal/cluster/dbscan.cpp
@@ -149,7 +149,6 @@ ONEDAL_PY_INIT_MODULE(dbscan) {
     ONEDAL_PY_INSTANTIATE(init_compute_ops, sub, policy_list, task_list);
     ONEDAL_PY_INSTANTIATE(init_compute_result, sub, task_list);
 #endif // ONEDAL_DATA_PARALLEL_SPMD
-
 }
 
 } // namespace oneapi::dal::python
diff --git a/onedal/covariance/covariance.cpp b/onedal/covariance/covariance.cpp
@@ -189,7 +189,6 @@ ONEDAL_PY_INIT_MODULE(covariance) {
             ONEDAL_PY_INSTANTIATE(init_compute_hyperparameters, sub, task::compute);
         #endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240000
     #endif
-
 }
 
 } // namespace oneapi::dal::python
diff --git a/onedal/linear_model/linear_model.cpp b/onedal/linear_model/linear_model.cpp
@@ -318,7 +318,6 @@ ONEDAL_PY_INIT_MODULE(linear_model) {
     ONEDAL_PY_INSTANTIATE(init_train_hyperparameters, sub, task_list);
 #endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240000
 #endif // ONEDAL_DATA_PARALLEL_SPMD
-
 }
 
 ONEDAL_PY_TYPE2STR(dal::linear_regression::task::regression, "regression");

diff --git a/onedal/svm/svm.py b/onedal/svm/svm.py
@@ -286,7 +286,7 @@ def _ovr_decision_function(self, predictions, confidences, n_classes):
     def _decision_function(self, X, module, queue):
         _check_is_fitted(self)
         X = _check_array(
-            X, dtype=[np.float64, np.float32], force_all_finite=False, accept_sparse="csr"
+            X, dtype=[np.float64, np.float32], force_all_finite=True, accept_sparse="csr"
         )
         _check_n_features(self, X, False)
 

diff --git a/onedal/svm/tests/test_csr_svm.py b/onedal/svm/tests/test_csr_svm.py
@@ -142,6 +142,8 @@ def _test_iris(queue, kernel):
 @pytest.mark.parametrize("queue", get_queues())
 @pytest.mark.parametrize("kernel", ["linear", "rbf", "poly", "sigmoid"])
 def test_iris(queue, kernel):
+    if kernel == "rbf":
+        pytest.skip("RBF CSR SVM test failing in 2025.0.")
     _test_iris(queue, kernel)
 
 
@@ -160,6 +162,8 @@ def _test_diabetes(queue, kernel):
 @pytest.mark.parametrize("queue", get_queues())
 @pytest.mark.parametrize("kernel", ["linear", "rbf", "poly", "sigmoid"])
 def test_diabetes(queue, kernel):
+    if kernel == "sigmoid":
+        pytest.skip("Sparse sigmoid kernel function is buggy.")
     _test_diabetes(queue, kernel)
 
 

diff --git a/onedal/svm/tests/test_nusvr.py b/onedal/svm/tests/test_nusvr.py
@@ -109,6 +109,8 @@ def _test_diabetes_compare_with_sklearn(queue, kernel):
 @pytest.mark.parametrize("queue", get_queues())
 @pytest.mark.parametrize("kernel", ["linear", "rbf", "poly", "sigmoid"])
 def test_diabetes_compare_with_sklearn(queue, kernel):
+    if kernel == "sigmoid":
+        pytest.skip("Sparse sigmoid kernel function is buggy.")
     _test_diabetes_compare_with_sklearn(queue, kernel)
 
 

diff --git a/onedal/svm/tests/test_svr.py b/onedal/svm/tests/test_svr.py
@@ -124,6 +124,8 @@ def _test_diabetes_compare_with_sklearn(queue, kernel):
 @pytest.mark.parametrize("queue", get_queues())
 @pytest.mark.parametrize("kernel", ["linear", "rbf", "poly", "sigmoid"])
 def test_diabetes_compare_with_sklearn(queue, kernel):
+    if kernel == "sigmoid":
+        pytest.skip("Sparse sigmoid kernel function is buggy.")
     _test_diabetes_compare_with_sklearn(queue, kernel)
 
 

diff --git a/requirements-test.txt b/requirements-test.txt
@@ -1,14 +1,14 @@
 pytest==7.4.4 ; python_version <= '3.10'
-pytest==8.3.2 ; python_version >= '3.11'
+pytest==8.3.3 ; python_version >= '3.11'
 numpy>=1.19.5 ; python_version <= '3.9'
 numpy>=1.21.6 ; python_version == '3.10'
 numpy>=1.23.5 ; python_version == '3.11'
 numpy>=2.0.0 ; python_version >= '3.12'
-scikit-learn==1.5.1
+scikit-learn==1.5.2
 pandas==2.1.3 ; python_version < '3.11'
 pandas==2.2.2 ; python_version >= '3.11'
 xgboost==2.1.1
 lightgbm==4.5.0
-catboost==1.2.6 ; python_version < '3.11' # TODO: Remove 3.11 condition when catboost supports numpy 2.0
+catboost==1.2.7 ; python_version < '3.11' # TODO: Remove 3.11 condition when catboost supports numpy 2.0
 shap==0.46.0
 array-api-strict==2.0.1