From cde94da0f5d252560aa68a248422e9a61abf46cf Mon Sep 17 00:00:00 2001 From: Samir Nasibli Date: Mon, 21 Oct 2024 15:16:26 +0200 Subject: [PATCH] DOC: Array API support (#1918) * DOC: Array API suport * Example for DBSCAN array API --- doc/sources/array_api.rst | 123 +++++++++++++++++++++++++ doc/sources/conf.py | 4 +- doc/sources/index.rst | 1 + examples/sklearnex/dbscan_array_api.py | 36 ++++++++ tests/run_examples.py | 1 + 5 files changed, 163 insertions(+), 2 deletions(-) create mode 100644 doc/sources/array_api.rst create mode 100644 examples/sklearnex/dbscan_array_api.py diff --git a/doc/sources/array_api.rst b/doc/sources/array_api.rst new file mode 100644 index 0000000000..4c71e0cb88 --- /dev/null +++ b/doc/sources/array_api.rst @@ -0,0 +1,123 @@ +.. ****************************************************************************** +.. * Copyright 2024 Intel Corporation +.. * +.. * Licensed under the Apache License, Version 2.0 (the "License"); +.. * you may not use this file except in compliance with the License. +.. * You may obtain a copy of the License at +.. * +.. * http://www.apache.org/licenses/LICENSE-2.0 +.. * +.. * Unless required by applicable law or agreed to in writing, software +.. * distributed under the License is distributed on an "AS IS" BASIS, +.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +.. * See the License for the specific language governing permissions and +.. * limitations under the License. +.. *******************************************************************************/ + +.. _array_api: + +================= +Array API support +================= +The `Array API `_ specification defines +a standard API for all array manipulation libraries with a NumPy-like API. +Intel(R) Extension for Scikit-Learn doesn't require +`array-api-compat `__ to be installed for +functional support of the array API standard. 
+In the current implementation, the functional support of array API follows the functional +support of different array or DataFrame inputs and does not modify the precision of the +input and output data formats unless necessary. Any array API input will be converted to host +numpy.ndarrays and all internal manipulations with data will be done with these representations of +the input data. DPNP's 'ndarray' and Data Parallel Control's 'usm_ndarray' have special handling +requirements that are described in the relevant section of this document. Output values will in +all relevant cases match the input data format. + +.. note:: + Currently, only `array-api-strict `__, + `dpctl `__, `dpnp `__ + and `numpy `__ are known to work with sklearnex estimators. +.. note:: + Stock Scikit-learn’s array API support requires `array-api-compat `__ to be installed. + + +Support for DPNP and DPCTL +========================== +The functional support of input data for sklearnex estimators is also extended to SYCL USM array types. +These include `dpnp's `__ ndarray and +`Data Parallel Control usm_ndarray `__. +DPNP ndarray and Data Parallel Control usm_ndarray contain SYCL contexts which can be used for +`sklearnex` device offloading. + +.. note:: + With the current support, DPNP ndarray and DPCTL usm_ndarray data may be copied and moved to and from the device in sklearnex, which + can impact memory utilization. + +DPCTL or DPNP inputs do not require the use of `config_context(target_offload=device)`. +`sklearnex` will use the SYCL context of the input usm_ndarray for device offloading. + +.. note:: + As DPCTL or DPNP inputs contain SYCL contexts, they do not require `config_context(target_offload=device)`. + However, the use of `config_context` will override the contained SYCL context and will force movement + of data to the targeted device. 
+ + +Support for Array API-compatible inputs +======================================= +All patched estimators, metrics, tools and non-scikit-learn estimators functionally support the Array API standard. +Intel(R) Extension for Scikit-Learn preserves input data format for all outputs. For all array inputs except +SYCL USM arrays `dpnp's `__ ndarray and +`Data Parallel Control usm_ndarray `__ all computation +will be performed only on the CPU unless a `config_context` with an available GPU device is specified. + +Stock scikit-learn uses `config_context(array_api_dispatch=True)` for enabling Array API +`support `__. +If `array_api_dispatch` is enabled and the installed Scikit-Learn version supports array API, then the original +inputs are used when falling back to Scikit-Learn functionality. + +.. note:: + Data Parallel Control usm_ndarray or DPNP ndarray inputs will use host numpy data copies when + falling back to Scikit-Learn since they are not array API compliant. +.. note:: + Functional support doesn't guarantee that after the model is trained, fitted attributes that are arrays + will also be from the same namespace as the training data. + + +Example usage +============= + +DPNP ndarrays +------------- + +Here is example code demonstrating how to use `dpnp `__ arrays to +run `RandomForestRegressor` on a GPU without `config_context(array_api_dispatch=True)`: + +.. literalinclude:: ../../examples/sklearnex/random_forest_regressor_dpnp.py + :language: python + + +.. note:: + Functional support doesn't guarantee that after the model is trained, fitted attributes that are arrays + will also be from the same namespace as the training data. + +For example, if `dpnp's `__ namespace was used for training, +then fitted attributes will be on the CPU and in `numpy.ndarray` data format. 
+ +DPCTL usm_ndarrays +------------------ +Here is example code demonstrating how to use `dpctl `__ +arrays to run `RandomForestClassifier` on a GPU without `config_context(array_api_dispatch=True)`: + +.. literalinclude:: ../../examples/sklearnex/random_forest_classifier_dpctl.py + :language: python + +As in the previous example, if the `dpctl `__ Array API namespace was +used for training, then fitted attributes will be on the CPU and in `numpy.ndarray` data format. + +Use of `array-api-strict` +------------------------- + +Here is example code demonstrating how to use `array-api-strict `__ +arrays to run `DBSCAN`. + +.. literalinclude:: ../../examples/sklearnex/dbscan_array_api.py + :language: python diff --git a/doc/sources/conf.py b/doc/sources/conf.py index 65c44f4c87..d40be16012 100755 --- a/doc/sources/conf.py +++ b/doc/sources/conf.py @@ -42,9 +42,9 @@ author = "Intel" # The short X.Y version -version = "2024.3.0" +version = "2025.0.0" # The full version, including alpha/beta/rc tags -release = "2024.3.0" +release = "2025.0.0" # -- General configuration --------------------------------------------------- diff --git a/doc/sources/index.rst b/doc/sources/index.rst index b4734ca257..4489f995a3 100755 --- a/doc/sources/index.rst +++ b/doc/sources/index.rst @@ -106,6 +106,7 @@ Enable Intel(R) GPU optimizations oneAPI and GPU support distributed-mode.rst non-scikit-algorithms.rst + array_api.rst verbose.rst deprecation.rst diff --git a/examples/sklearnex/dbscan_array_api.py b/examples/sklearnex/dbscan_array_api.py new file mode 100644 index 0000000000..90584bd152 --- /dev/null +++ b/examples/sklearnex/dbscan_array_api.py @@ -0,0 +1,36 @@ +# ============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import array_api_strict + +from sklearnex import config_context, patch_sklearn + +patch_sklearn() + +from sklearn.cluster import DBSCAN + +X = array_api_strict.asarray( + [[1.0, 2.0], [2.0, 2.0], [2.0, 3.0], [8.0, 7.0], [8.0, 8.0], [25.0, 80.0]], + dtype=array_api_strict.float32, +) + +# Can be launched without `config_context(array_api_dispatch=True)`. For sklearnex, +# this context manager only guarantees that, in case of a fallback to stock +# scikit-learn, fitted attributes are from the same Array API namespace as +# the training data. +clustering = DBSCAN(eps=3, min_samples=2).fit(X) + +print(f"Fitted labels :\n", clustering.labels_) diff --git a/tests/run_examples.py b/tests/run_examples.py index 8f2f10ad01..8aac356baf 100755 --- a/tests/run_examples.py +++ b/tests/run_examples.py @@ -165,6 +165,7 @@ def check_library(rule): req_library["basic_statistics_spmd.py"] = ["dpctl", "mpi4py"] req_library["covariance_spmd.py"] = ["dpctl", "mpi4py"] req_library["dbscan_spmd.py"] = ["dpctl", "mpi4py"] +req_library["dbscan_array_api.py"] = ["array_api_strict"] req_library["incremental_basic_statistics_dpctl.py"] = ["dpctl"] req_library["incremental_covariance_spmd.py"] = ["dpctl", "mpi4py"] req_library["incremental_linear_regression_dpctl.py"] = ["dpctl"]