diff --git a/.gitleaksignore b/.gitleaksignore index 62ac2e2360..98411ee9e9 100644 --- a/.gitleaksignore +++ b/.gitleaksignore @@ -8,3 +8,4 @@ a99389ee01cbb972e46a892d3d0e9c7f8ee23f59:use_case_examples/training/analyze.ipyn a99389ee01cbb972e46a892d3d0e9c7f8ee23f59:use_case_examples/training/analyze.ipynb:aws-access-token:18379 f41de03048a9ed27946b875e81b34138bb4bb17b:use_case_examples/training/analyze.ipynb:aws-access-token:6404 e2904473898ddd325f245f4faca526a0e9520f49:builders/Dockerfile.zamalang-env:generic-api-key:5 +7d5e885816f1f1e432dd94da38c5c8267292056a:docs/advanced_examples/XGBRegressor.ipynb:aws-access-token:1026 diff --git a/deps_licenses/licenses_mac_silicon_user.txt b/deps_licenses/licenses_mac_silicon_user.txt index db194acd7e..36720816ac 100644 --- a/deps_licenses/licenses_mac_silicon_user.txt +++ b/deps_licenses/licenses_mac_silicon_user.txt @@ -1,28 +1,28 @@ Name, Version, License PyYAML, 6.0.1, MIT License brevitas, 0.8.0, UNKNOWN -certifi, 2024.2.2, Mozilla Public License 2.0 (MPL 2.0) +certifi, 2023.7.22, Mozilla Public License 2.0 (MPL 2.0) charset-normalizer, 3.3.2, MIT License coloredlogs, 15.0.1, MIT License concrete-python, 2024.4.19, BSD-3-Clause dependencies, 2.0.1, BSD License dill, 0.3.8, BSD License -filelock, 3.14.0, The Unlicense (Unlicense) +filelock, 3.13.4, The Unlicense (Unlicense) flatbuffers, 24.3.25, Apache Software License fsspec, 2024.3.1, BSD License huggingface-hub, 0.22.2, Apache Software License humanfriendly, 10.0, MIT License -hummingbird-ml, 0.4.11, MIT License +hummingbird-ml, 0.4.8, MIT License idna, 3.7, BSD License importlib_resources, 6.4.0, Apache Software License joblib, 1.4.0, BSD License jsonpickle, 3.0.4, BSD License mpmath, 1.3.0, BSD License networkx, 3.1, BSD License -numpy, 1.24.3, BSD License -onnx, 1.16.0, Apache License v2.0 +numpy, 1.23.5, BSD License +onnx, 1.15.0, Apache License v2.0 onnxconverter-common, 1.13.0, MIT License -onnxmltools, 1.12.0, Apache Software License +onnxmltools, 1.11.0, Apache 
Software License onnxoptimizer, 0.3.13, Apache License v2.0 onnxruntime, 1.17.3, MIT License packaging, 24.0, Apache Software License; BSD License @@ -32,19 +32,20 @@ psutil, 5.9.8, BSD License python-dateutil, 2.9.0.post0, Apache Software License; BSD License pytz, 2024.1, MIT License requests, 2.31.0, Apache Software License -scikit-learn, 1.3.2, BSD License +scikit-learn, 1.1.3, BSD License scipy, 1.10.1, BSD License six, 1.16.0, MIT License -skl2onnx, 1.16.0, Apache Software License +skl2onnx, 1.12, Apache Software License skops, 0.5.0, MIT skorch, 0.11.0, new BSD 3-Clause sympy, 1.12, BSD License tabulate, 0.8.10, MIT License -threadpoolctl, 3.5.0, BSD License +threadpoolctl, 3.4.0, BSD License torch, 1.13.1, BSD License tqdm, 4.66.2, MIT License; Mozilla Public License 2.0 (MPL 2.0) typing_extensions, 4.5.0, Python Software Foundation License tzdata, 2024.1, Apache Software License urllib3, 2.2.1, MIT License -xgboost, 1.7.6, Apache Software License +xgboost, 1.6.2, Apache Software License z3-solver, 4.13.0.0, MIT License +zipp, 3.18.1, MIT License diff --git a/deps_licenses/licenses_mac_silicon_user.txt.md5 b/deps_licenses/licenses_mac_silicon_user.txt.md5 index 0aa27f999d..aaf4e6a9ed 100644 --- a/deps_licenses/licenses_mac_silicon_user.txt.md5 +++ b/deps_licenses/licenses_mac_silicon_user.txt.md5 @@ -1 +1 @@ -7be80ba54850fbc203015560c8acb9a8 +9b8316c2a6c823884676b39f52eb018a diff --git a/docs/advanced_examples/DecisionTreeClassifier.ipynb b/docs/advanced_examples/DecisionTreeClassifier.ipynb index b3f59fb015..8aca4f5c86 100644 --- a/docs/advanced_examples/DecisionTreeClassifier.ipynb +++ b/docs/advanced_examples/DecisionTreeClassifier.ipynb @@ -35,8 +35,12 @@ "import time\n", "\n", "import numpy\n", - "from sklearn.datasets import fetch_openml\n", - "from sklearn.model_selection import train_test_split\n", + "from sklearn.datasets import fetch_openml, make_classification\n", + "from sklearn.metrics import accuracy_score, average_precision_score, 
confusion_matrix\n", + "\n", + "# Find best hyper parameters with cross validation\n", + "from sklearn.model_selection import GridSearchCV, train_test_split\n", + "from tqdm.auto import tqdm\n", "\n", "features, classes = fetch_openml(data_id=44, as_frame=False, cache=True, return_X_y=True)\n", "classes = classes.astype(numpy.int64)\n", @@ -65,16 +69,29 @@ "name": "stdout", "output_type": "stream", "text": [ +<<<<<<< HEAD "Best hyper parameters: {'max_depth': None, 'max_features': None, 'min_samples_leaf': 10, 'min_samples_split': 100}\n", "Best score: 0.9302470037668247\n" +======= + "Best hyper parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 10}\n", + "Best score: 0.9300392741953442\n", + "CPU times: user 3 s, sys: 653 ms, total: 3.65 s\n", + "Wall time: 24.9 s\n" +>>>>>>> 86507b52 (feat: support `from_sklearn` for trees) ] } ], "source": [ + "from sklearn.ensemble import RandomForestClassifier as SklearnRandomForestClassifier\n", + "\n", + "%%time\n", "# Find best hyper parameters with cross validation\n", - "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.tree import DecisionTreeClassifier as SklearnDecisionTreeClassifier\n", + "from xgboost.sklearn import XGBClassifier as SklearnXGBClassifier\n", "\n", "from concrete.ml.sklearn import DecisionTreeClassifier as ConcreteDecisionTreeClassifier\n", + "from concrete.ml.sklearn import RandomForestClassifier as ConcreteRandomForestClassifier\n", + "from concrete.ml.sklearn import XGBClassifier as ConcreteXGBClassifier\n", "\n", "# List of hyper parameters to tune\n", "param_grid = {\n", @@ -132,20 +149,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "Sklearn average precision score: 0.95\n", - "Concrete average precision score: 0.97\n" + "Sklearn average precision score: 0.94\n", + "Concrete average precision score: 0.94\n" ] } ], "source": [ "# Compute average precision on test\n", - "from sklearn.metrics import 
average_precision_score\n", "\n", "# pylint: disable=no-member\n", "y_pred_concrete = model.predict_proba(x_test)[:, 1]\n", "y_pred_sklearn = sklearn_model.predict_proba(x_test)[:, 1]\n", + "\n", "concrete_average_precision = average_precision_score(y_test, y_pred_concrete)\n", "sklearn_average_precision = average_precision_score(y_test, y_pred_sklearn)\n", + "\n", "print(f\"Sklearn average precision score: {sklearn_average_precision:0.2f}\")\n", "print(f\"Concrete average precision score: {concrete_average_precision:0.2f}\")" ] @@ -169,8 +187,8 @@ "text": [ "Number of test samples: 691\n", "Number of spams in test samples: 304\n", - "True Negative (legit mail well classified) rate: 0.9612403100775194\n", - "False Positive (legit mail classified as spam) rate: 0.03875968992248062\n", + "True Negative (legit mail well classified) rate: 0.9302325581395349\n", + "False Positive (legit mail classified as spam) rate: 0.06976744186046512\n", "False Negative (spam mail classified as legit) rate: 0.14473684210526316\n", "True Positive (spam well classified) rate: 0.8552631578947368\n" ] @@ -178,7 +196,6 @@ ], "source": [ "# Show the confusion matrix on x_test\n", - "from sklearn.metrics import confusion_matrix\n", "\n", "y_pred = model.predict(x_test)\n", "true_negative, false_positive, false_negative, true_positive = confusion_matrix(\n", @@ -247,7 +264,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Key generation time: 0.48 seconds\n" + "Key generation time: 0.82 seconds\n" ] } ], @@ -265,7 +282,7 @@ "source": [ "# Reduce the sample size for a faster total execution time\n", "FHE_SAMPLES = 10\n", - "x_test = x_test[:FHE_SAMPLES]\n", + "x_test_fhe = x_test[:FHE_SAMPLES]\n", "y_pred = y_pred[:FHE_SAMPLES]\n", "y_reference = y_test[:FHE_SAMPLES]" ] @@ -279,15 +296,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "Execution time: 0.53 seconds per sample\n" + "Execution time: 3.75 seconds per sample\n" ] } ], "source": [ "# Predict in FHE for a few 
examples\n", "time_begin = time.time()\n", - "y_pred_fhe = model.predict(x_test, fhe=\"execute\")\n", - "print(f\"Execution time: {(time.time() - time_begin) / len(x_test):.2f} seconds per sample\")" + "y_pred_fhe = model.predict(x_test_fhe, fhe=\"execute\")\n", + "print(f\"Execution time: {(time.time() - time_begin) / len(x_test_fhe):.2f} seconds per sample\")" ] }, { @@ -300,8 +317,8 @@ "output_type": "stream", "text": [ "Ground truth: [0 0 0 1 0 1 0 0 0 0]\n", - "Prediction sklearn: [0 0 0 1 0 1 0 0 0 0]\n", - "Prediction FHE: [0 0 0 1 0 1 0 0 0 0]\n" + "Prediction sklearn: [0 0 0 0 0 0 0 1 0 0]\n", + "Prediction FHE: [0 0 0 0 0 0 0 1 0 0]\n" ] } ], @@ -332,6 +349,234 @@ ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Importing from scikit-learn" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "features, classes = fetch_openml(data_id=44, as_frame=False, cache=True, return_X_y=True)\n", + "classes = classes.astype(numpy.int64)\n", + "features, classes = make_classification(\n", + " **{\"n_samples\": 1000, \"n_features\": 10, \"n_classes\": 4, \"n_informative\": 10, \"n_redundant\": 0}\n", + ")\n", + "\n", + "\n", + "x_train, x_test, y_train, y_test = train_test_split(\n", + " features,\n", + " classes,\n", + " test_size=0.15,\n", + " random_state=42,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e7c1622bb3574144b642a80cb6ef1e9b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/14 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "cv = False\n", + "\n", + "for model_name in [\"tree\", \"rf\", \"xgb\"]:\n", + " if model_name == \"tree\":\n", + " ClearModel = SklearnDecisionTreeClassifier\n", + " FheModel = ConcreteDecisionTreeClassifier\n", + "\n", + " param_grid = {\n", + " \"max_features\": [None, \"auto\", \"sqrt\", \"log2\"],\n", + " \"min_samples_leaf\": [1, 10, 100],\n", + " \"min_samples_split\": [2, 10, 100],\n", + " \"max_depth\": [None, 2, 4, 6, 8],\n", + " }\n", + " elif model_name == \"rf\":\n", + " ClearModel = SklearnRandomForestClassifier\n", + " FheModel = ConcreteRandomForestClassifier\n", + "\n", + " param_grid = {\n", + " \"max_features\": [None, \"auto\", \"sqrt\", \"log2\"],\n", + " \"min_samples_leaf\": [1, 10, 100],\n", + " \"min_samples_split\": [2, 10, 100],\n", + " \"max_depth\": [None, 2, 4, 6, 8],\n", + " \"n_estimators\": [2, 8, 16],\n", + " }\n", + " elif model_name == \"xgb\":\n", + " ClearModel = SklearnXGBClassifier\n", + " FheModel = ConcreteXGBClassifier\n", + "\n", + " param_grid = {\n", + " # \"max_features\": [None, \"auto\", \"sqrt\", \"log2\"],\n", + " \"min_samples_leaf\": [1, 10, 100],\n", + " \"min_samples_split\": [2, 10, 100],\n", + " #\n", + " \"max_depth\": [None, 2, 4, 6, 8],\n", + " \"n_estimators\": [2, 8, 16],\n", + " }\n", + " assert FheModel.__name__ == ClearModel.__name__\n", + "\n", + " # List of hyper parameters to tune\n", + " if cv:\n", + "\n", + " grid_search = GridSearchCV(\n", + " ClearModel(),\n", + " param_grid,\n", + " cv=10,\n", + " scoring=\"average_precision\",\n", + " error_score=\"raise\",\n", + " n_jobs=-1,\n", + " )\n", + "\n", + " gs_results = grid_search.fit(x_train, y_train)\n", + " print(\"Best hyper parameters:\", gs_results.best_params_)\n", + " print(\"Best score:\", gs_results.best_score_)\n", + "\n", + " # Build the model with best hyper parameters\n", + " sk_model = ClearModel(\n", + " **gs_results.best_params_,\n", + " )\n", + 
" else:\n", + " sk_model = ClearModel()\n", + "\n", + " sk_model.fit(x_train, y_train)\n", + "\n", + " # Compute average precision on test\n", + "\n", + " data = []\n", + " verbose = False\n", + "\n", + " for n_bits in tqdm(range(2, 16)):\n", + "\n", + " model_from_data = FheModel.from_sklearn_model(sk_model, x_train, n_bits=n_bits)\n", + " model_from_thresholds = FheModel.from_sklearn_model(sk_model, n_bits=n_bits)\n", + "\n", + " # pylint: disable=no-member\n", + " y_pred_concrete_from_data = model_from_data.predict_proba(x_test).argmax(axis=1)\n", + " y_pred_concrete_from_thresholds = model_from_thresholds.predict_proba(x_test).argmax(axis=1)\n", + " y_pred_sklearn = sk_model.predict_proba(x_test).argmax(axis=1)\n", + "\n", + " concrete_from_data_average_precision = accuracy_score(y_test, y_pred_concrete_from_data)\n", + " concrete_from_thresholds_average_precision = accuracy_score(\n", + " y_test, y_pred_concrete_from_thresholds\n", + " )\n", + " sklearn_average_precision = accuracy_score(y_test, y_pred_sklearn)\n", + "\n", + " data.append(\n", + " {\n", + " \"n_bits\": n_bits,\n", + " \"sklearn\": sklearn_average_precision,\n", + " \"from_thresholds\": concrete_from_thresholds_average_precision,\n", + " \"from_data\": concrete_from_data_average_precision,\n", + " }\n", + " )\n", + " if verbose:\n", + " print(f\"Sklearn average precision score: {sklearn_average_precision:0.2f}\")\n", + " print(\n", + " \"Concrete (from data) average precision score: \"\n", + " f\"{concrete_from_data_average_precision:0.2f}\"\n", + " )\n", + " print(\n", + " \"Concrete (from thresholds) average precision score: \"\n", + " f\"{concrete_from_thresholds_average_precision:0.2f}\"\n", + " )\n", + "\n", + " import matplotlib.pyplot as plt\n", + " import pandas as pd\n", + "\n", + " data = pd.DataFrame(data)\n", + "\n", + " fig, ax = plt.subplots()\n", + " for label in [\"sklearn\", \"from_thresholds\", \"from_data\"]:\n", + " ax.plot(data[\"n_bits\"], data[label], label=label)\n", + " 
ax.set_ylabel(\"accuracy\")\n", + " ax.set_xlabel(\"n-bits\")\n", + " ax.legend()\n", + " ax.set_title(f\"Accuracy per n-bits importing a {FheModel.__name__}\")\n", + " fig.show()\n", + "\n", + " data[\"sklearn - data\"] = data[\"sklearn\"] - data[\"from_data\"]\n", + " data[\"sklearn - threshold\"] = data[\"sklearn\"] - data[\"from_thresholds\"]" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -355,5 +600,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/docs/advanced_examples/XGBClassifier.ipynb b/docs/advanced_examples/XGBClassifier.ipynb index 5d9607028d..69fc290f9f 100644 --- a/docs/advanced_examples/XGBClassifier.ipynb +++ b/docs/advanced_examples/XGBClassifier.ipynb @@ -587,5 +587,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/poetry.lock b/poetry.lock index 296f3f21e4..0dbd3a8a3f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. 
[[package]] name = "absl-py" @@ -4720,38 +4720,6 @@ pytest = ">=4.6" [package.extras] testing = ["fields", "hunter", "process-tests", "pytest-xdist", "six", "virtualenv"] -[[package]] -name = "pytest-json-report" -version = "1.5.0" -description = "A pytest plugin to report test results as JSON files" -optional = false -python-versions = "*" -files = [ - {file = "pytest-json-report-1.5.0.tar.gz", hash = "sha256:2dde3c647851a19b5f3700729e8310a6e66efb2077d674f27ddea3d34dc615de"}, - {file = "pytest_json_report-1.5.0-py3-none-any.whl", hash = "sha256:9897b68c910b12a2e48dd849f9a284b2c79a732a8a9cb398452ddd23d3c8c325"}, -] - -[package.dependencies] -pytest = ">=3.8.0" -pytest-metadata = "*" - -[[package]] -name = "pytest-metadata" -version = "3.1.1" -description = "pytest plugin for test session metadata" -optional = false -python-versions = ">=3.8" -files = [ - {file = "pytest_metadata-3.1.1-py3-none-any.whl", hash = "sha256:c8e0844db684ee1c798cfa38908d20d67d0463ecb6137c72e91f418558dd5f4b"}, - {file = "pytest_metadata-3.1.1.tar.gz", hash = "sha256:d2a29b0355fbc03f168aa96d41ff88b1a3b44a3b02acbe491801c98a048017c8"}, -] - -[package.dependencies] -pytest = ">=7.0.0" - -[package.extras] -test = ["black (>=22.1.0)", "flake8 (>=4.0.1)", "pre-commit (>=2.17.0)", "tox (>=3.24.5)"] - [[package]] name = "pytest-randomly" version = "3.15.0" @@ -4781,6 +4749,21 @@ files = [ [package.dependencies] pytest = "*" +[[package]] +name = "pytest-subtests" +version = "0.11.0" +description = "unittest subTest() support and subtests fixture" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest-subtests-0.11.0.tar.gz", hash = "sha256:51865c88457545f51fb72011942f0a3c6901ee9e24cbfb6d1b9dc1348bafbe37"}, + {file = "pytest_subtests-0.11.0-py3-none-any.whl", hash = "sha256:453389984952eec85ab0ce0c4f026337153df79587048271c7fd0f49119c07e4"}, +] + +[package.dependencies] +attrs = ">=19.2.0" +pytest = ">=7.0" + [[package]] name = "pytest-xdist" version = "3.5.0" @@ -7310,4 +7293,4 @@ 
testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.11" -content-hash = "18e37472ff221c5e6f57c7a32e3d4d84ed999622ee5ec3c9900627bd9e01349a" +content-hash = "e27e0b729449f72af645776b4002a9e8fba9bfd49a9282bc87104fecc1e410fe" diff --git a/pyproject.toml b/pyproject.toml index e35631fcf4..834cff6acd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,18 +64,18 @@ pylint = "^2.13.0" # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/2541 pytest = "7.4.1" pytest-cov = "^4.1.0" +pytest-xdist = "^3.3.1" +pytest-randomly = "^3.11.0" +pytest-repeat = "^0.9.1" +pytest-subtests = "^0.11.0" pytest_codeblocks = "^0.14.0" mypy = "^1.8.0" pydocstyle = "^6.1.1" python-semantic-release = "^7.27.0" semver = "^2.13.0" tomlkit = "^0.7.0" -pytest-json-report = "^1.5.0" -pytest-xdist = "^3.3.1" -pytest-randomly = "^3.11.0" nbmake = "^1.3.0" pygments-style-tomorrow = "^1.0.0" -pytest-repeat = "^0.9.1" mdformat = "^0.7.14" mdformat_myst = "^0.1.4" mdformat-toc = "^0.3.0" diff --git a/src/concrete/ml/onnx/onnx_impl_utils.py b/src/concrete/ml/onnx/onnx_impl_utils.py index 158f513ae4..6a04fe6c14 100644 --- a/src/concrete/ml/onnx/onnx_impl_utils.py +++ b/src/concrete/ml/onnx/onnx_impl_utils.py @@ -5,7 +5,7 @@ import numpy from concrete.fhe import conv as fhe_conv from concrete.fhe import ones as fhe_ones -from concrete.fhe import round_bit_pattern +from concrete.fhe import truncate_bit_pattern from concrete.fhe.tracing import Tracer from ..common.debugging import assert_true @@ -265,9 +265,16 @@ def rounded_comparison( # Workaround: in this context, `round_bit_pattern` is used as a truncate operation. # Consequently, we subtract a term, called `half` that will subsequently be re-added during the # `round_bit_pattern` process. 
- half = 1 << (lsbs_to_remove - 1) + # half = 1 << (lsbs_to_remove - 1) # To determine if 'x' 'operation' 'y' (operation being <, >, >=, <=), we evaluate 'x - y' - rounded_subtraction = round_bit_pattern((x - y) - half, lsbs_to_remove=lsbs_to_remove) + # We cast to int because if half is too high the result might be float + # intermediate = ((x - y) - half) + # intermediate_as_int = intermediate.astype(numpy.int64) + # + # if not isinstance(intermediate, Tracer): + # assert (intermediate == intermediate_as_int).all() + + rounded_subtraction = truncate_bit_pattern(x - y, lsbs_to_remove=lsbs_to_remove) return (operation(rounded_subtraction),) diff --git a/src/concrete/ml/quantization/quantizers.py b/src/concrete/ml/quantization/quantizers.py index 2807645434..c4f682f9d7 100644 --- a/src/concrete/ml/quantization/quantizers.py +++ b/src/concrete/ml/quantization/quantizers.py @@ -797,7 +797,7 @@ def dequant(self, qvalues: numpy.ndarray) -> Union[numpy.ndarray, Tracer]: values = self.scale * (qvalues - numpy.asarray(self.zero_point, dtype=numpy.float64)) - assert isinstance(values, (numpy.ndarray, Tracer)) + assert isinstance(values, (float, int, numpy.ndarray, Tracer)) return values diff --git a/src/concrete/ml/sklearn/base.py b/src/concrete/ml/sklearn/base.py index 125e3f09a0..93bbda10e0 100644 --- a/src/concrete/ml/sklearn/base.py +++ b/src/concrete/ml/sklearn/base.py @@ -3,6 +3,7 @@ from __future__ import annotations import copy +import inspect import os import tempfile @@ -13,9 +14,12 @@ from abc import ABC, abstractmethod from functools import partial from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Set, TextIO, Type, Union +from typing import Any, Callable, Dict, List, Optional, Set, TextIO, Tuple, Type, Union import brevitas.nn as qnn + +# pylint: disable-next=ungrouped-imports +import concrete.fhe as cp import numpy import onnx import sklearn @@ -27,12 +31,11 @@ from concrete.fhe.compilation.compiler import Compiler from 
concrete.fhe.compilation.configuration import Configuration from concrete.fhe.dtypes.integer import Integer +from onnx import numpy_helper from sklearn.base import clone from sklearn.linear_model import LinearRegression, LogisticRegression from sklearn.utils.validation import check_is_fitted - -# pylint: disable-next=ungrouped-imports -from concrete import fhe as cp +from xgboost.sklearn import XGBModel from ..common.check_inputs import check_array_and_assert, check_X_y_and_assert_multi_output from ..common.debugging.custom_assert import assert_true @@ -65,7 +68,15 @@ ) from ..torch import NumpyModule from .qnn_module import SparseQuantNeuralNetwork -from .tree_to_numpy import tree_to_numpy +from .tree_to_numpy import ( + _compute_lsb_to_remove_for_trees, + get_equivalent_numpy_forward_from_onnx_tree, + get_onnx_model, + is_regressor_or_partial_regressor, + preprocess_tree_predictions, + tree_onnx_graph_preprocessing, + tree_to_numpy, +) # Disable pylint to import Hummingbird while ignoring the warnings # pylint: disable=wrong-import-position,wrong-import-order @@ -1328,6 +1339,279 @@ def __init__(self, n_bits: Union[int, Dict[str, int]]): BaseEstimator.__init__(self) + # TODO: FIX EXACT PREDICTION WITH HIGH BIT WIDTH + # pylint: disable=too-many-locals,too-many-statements,too-many-branches + @classmethod + def from_sklearn_model( + cls, + sklearn_model: sklearn.base.BaseEstimator, + X: Optional[numpy.ndarray] = None, + n_bits: int = 8, + ): + """Build a FHE-compliant model using a fitted scikit-learn model. + + Args: + sklearn_model (sklearn.base.BaseEstimator): The fitted scikit-learn model to convert. + X (Optional[Data]): A representative set of input values used for computing quantization + parameters, as a Numpy array, Torch tensor, Pandas DataFrame or List. This is + usually the training data-set or a sub-set of it. + n_bits (int): Number of bits to quantize the model. 
If an int is passed + for n_bits, the value will be used for quantizing inputs and weights. If a dict is + passed, then it should contain "op_inputs" and "op_weights" as keys with + corresponding number of quantization bits so that: + - op_inputs : number of bits to quantize the input values + - op_weights: number of bits to quantize the learned parameters + Default to 8. + + Returns: + The FHE-compliant fitted model. + """ + # Check that sklearn_model is a proper fitted scikit-learn model + check_is_fitted(sklearn_model) + + # Extract scikit-learn's initialization parameters + init_params = sklearn_model.get_params() + + # Instantiate the Concrete ML model and update initialization parameters + # This update is necessary as we currently store scikit-learn attributes in Concrete ML + # classes during initialization (for example: link or power attributes in GLMs) + # Without it, these attributes will have default values instead of the ones used by the + # scikit-learn models + # This should be fixed once Concrete ML models initialize their underlying scikit-learn + # models during initialization + # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3373 + # Needed for XGB + + cls_signature = inspect.signature(cls) + init_params_keys = list(init_params.keys()) + for key in init_params_keys: + if key not in cls_signature.parameters: + init_params.pop(key) + model = cls(n_bits=n_bits, **init_params) + model._is_fitted = True + + # Update the underlying scikit-learn model with the given fitted one + model.sklearn_model = copy.deepcopy(sklearn_model) + + # Get the onnx model, all operations needed to load it properly will be done on it. 
+ n_features = model.n_features_in_ + dummy_input = numpy.zeros((1, n_features)) + framework = "xgboost" if isinstance(sklearn_model, XGBModel) else "sklearn" + onnx_model = get_onnx_model( + model=sklearn_model, + x=dummy_input, + framework=framework, + ) + + # Get feature -> thresholds mappings and threshold values + weight_1 = numpy.empty((0,)) + bias_1 = numpy.empty((0,)) + bias_1_index = -1 + bias_1_name = "" + + for initializer_index, initializer in enumerate(onnx_model.graph.initializer): + init_tensor = numpy_helper.to_array(initializer) + if "weight_1" in initializer.name: + # weight_1 is the feature node selector + weight_1 = init_tensor.copy() + elif "bias_1" in initializer.name: + # bias _1 is the threshold tensor + bias_1 = init_tensor.copy() + bias_1_index = initializer_index + bias_1_name = initializer.name + + assert bias_1_name + assert bias_1_index >= 0 + assert weight_1.size != 0 + assert bias_1.size != 0 + + # Compute input/threshold quantizers + input_quantizers: List[UniformQuantizer] = [] + + # Quantization of each feature in X + for feature_index in range(n_features): + + # Get all thresholds for a given feature + threshold_for_feature: numpy.ndarray = bias_1[weight_1[:, feature_index] == 1][:, 0] + # Sorting threshold values makes things easier afterwards + threshold_for_feature.sort() + + # All unique threshold values + unique_threshold_for_feature_sorted = numpy.unique(threshold_for_feature) + unique_threshold_for_feature_sorted.sort() + num_unique_thresholds = len(unique_threshold_for_feature_sorted) + + if num_unique_thresholds >= 1: + max_threshold_value = unique_threshold_for_feature_sorted.max() + min_threshold_value = unique_threshold_for_feature_sorted.min() + else: + # TODO: maybe we should pick a random value here ? 
+ max_threshold_value = 1.0 + min_threshold_value = 0.0 + + # We compute a epsilon such that we have one quantized value on each side of the range + # This offset will either be a right or left offset according to the framework + # TODO: reconsider this + number_of_need_offset_values = 2 + if num_unique_thresholds == 0: + epsilon = 1.0 + elif num_unique_thresholds == 1: + epsilon = 1.0 + else: + epsilon = (max_threshold_value - min_threshold_value) / ( + (2**n_bits) - number_of_need_offset_values + ) + + # Input quantizers based on thresholds + # TODO: DOUBLE CHECK THIS PART + if X is None: + if num_unique_thresholds: + min_quantization_value = min_threshold_value + max_quantization_value = max_threshold_value + else: + min_quantization_value = 0 + max_quantization_value = 1.0 + + if num_unique_thresholds == 1: + # If there is only one threshold for this feature + # We want the threshold to be in the middle of a quantization bin + min_quantization_value -= epsilon + max_quantization_value += epsilon + elif framework == "xgboost": + # XGBoost uses a < op so we must add a left offset + min_quantization_value -= epsilon + else: + # scikit-learn uses =< op so we must add a right offset + max_quantization_value += epsilon + + # Quantizer based on data + else: + min_quantization_value = X[:, feature_index].min() + max_quantization_value = X[:, feature_index].max() + + min_quantization_value = float(min_quantization_value) + max_quantization_value = float(max_quantization_value) + input_quantizer = QuantizedArray( + n_bits=n_bits, + values=numpy.array([min_quantization_value, max_quantization_value]), + ).quantizer + # TODO: Assert that there is one and only one bit-value above and below the threshold + input_quantizers.append(input_quantizer) + + # Convert thresholds to their quantized equivalent + quantized_thresholds_array = numpy.empty(bias_1.shape, dtype=numpy.int64) + dequantized_thresholds_array = numpy.empty(bias_1.shape, dtype=numpy.float64) + + for 
threshold_index, threshold_value in enumerate(bias_1[:, 0]): + feature_index = int(weight_1[threshold_index, :].argmax()) + quantized_threshold_value = ( + input_quantizers[feature_index].quant(threshold_value).astype(numpy.int64) + ) + dequantized_threshold_value = input_quantizers[feature_index].dequant( + quantized_threshold_value + ) + quantized_thresholds_array[threshold_index, 0] = quantized_threshold_value + dequantized_thresholds_array[threshold_index, 0] = dequantized_threshold_value + + # TODO: debug + if n_bits > 20: + diff = dequantized_thresholds_array - bias_1 + max_diff = numpy.abs(diff).max() + if max_diff > 1e-4: + print("ERROR") + print(max_diff) + + onnx_model.graph.initializer[bias_1_index].CopyFrom( + numpy_helper.from_array( + quantized_thresholds_array, + bias_1_name, + ) + ) + + # Tree values pre-processing + # i.e., mainly predictions quantization + # but also rounding the threshold such that they are now integers + model._set_post_processing_params() + + # Get the expected number of ONNX outputs in the sklearn model. + expected_number_of_outputs = 1 if is_regressor_or_partial_regressor(model) else 2 + + # Modify the graph inplace to keep only the parts that are of interest to us + tree_onnx_graph_preprocessing(onnx_model, framework, expected_number_of_outputs) + + # Get the preprocessed tree predictions to replace the current + # (non-quantized) values in the onnx_model. 
+ q_y = None + for initializer_index, initializer in enumerate(onnx_model.graph.initializer): + init_tensor = numpy_helper.to_array(initializer) + if "weight_3" in initializer.name: + # weight_3 is the prediction tensor + # Here we quantize it + q_y = preprocess_tree_predictions(init_tensor, n_bits) + init_tensor_as_int = q_y.qvalues.astype(numpy.int64) + else: + init_tensor_as_int = init_tensor.astype(numpy.int64) + assert ( + isinstance(init_tensor_as_int, numpy.ndarray) + and init_tensor_as_int.dtype == numpy.int64 + ) + new_initializer = numpy_helper.from_array(init_tensor_as_int, initializer.name) + onnx_model.graph.initializer[initializer_index].CopyFrom(new_initializer) + + # Convert the tree inference with Numpy operators + enable_rounding = bool(int(os.environ.get("TREES_USE_ROUNDING", 1))) + + if not enable_rounding: + warnings.simplefilter("always") + warnings.warn( + "Using Concrete tree-based models without the `rounding feature` is deprecated. " + "Consider setting 'use_rounding' to `True` for making the FHE inference faster " + "and key generation.", + category=DeprecationWarning, + stacklevel=2, + ) + + lsbs_to_remove_for_trees: Optional[Tuple[int, int]] = None + + model.input_quantizers = input_quantizers + assert q_y is not None + model.output_quantizers = [q_y.quantizer] + + if enable_rounding: + # Quantize some data + if X is None: + assert isinstance(n_features, int) + calibration_set_size = 100_000 + q_X = numpy.empty((calibration_set_size, n_features), dtype=numpy.int64) + for feature_index in range(n_features): + min_value = input_quantizers[feature_index].rmin + assert min_value is not None + max_value = input_quantizers[feature_index].rmax + assert max_value is not None + q_X[:, feature_index] = ( + input_quantizers[feature_index] + .quant(numpy.linspace(min_value, max_value, calibration_set_size)) + .astype(numpy.int64) + ) + q_X = numpy.random.permutation(q_X) + else: + q_X = model.quantize_input(X).astype(numpy.int64) + + # Compute for 
tree-based models the LSB to remove in stage 1 and stage 2 + # First LSB refers to Less or LessOrEqual comparisons + # Second LSB refers to Equal comparison + assert q_X.dtype == numpy.int64 + lsbs_to_remove_for_trees = _compute_lsb_to_remove_for_trees(onnx_model, q_X) + + # mypy + assert len(lsbs_to_remove_for_trees) == 2 + + model._tree_inference, model.onnx_model_ = get_equivalent_numpy_forward_from_onnx_tree( + onnx_model, lsbs_to_remove_for_trees=lsbs_to_remove_for_trees + ) + + return model + def fit(self, X: Data, y: Target, **fit_parameters): # Reset for double fit self._is_fitted = False @@ -1560,6 +1844,8 @@ def from_sklearn_model( Returns: The FHE-compliant fitted model. """ + # For now we don't use X for quantization, only the thresholds + # We could support import with data as quantizer too # Check that sklearn_model is a proper fitted scikit-learn model check_is_fitted(sklearn_model) @@ -1660,7 +1946,7 @@ def _quantize_model(self, X): weights = self.sklearn_model.coef_.T q_weights = QuantizedArray( n_bits=n_bits["op_weights"], - values=numpy.expand_dims(weights, axis=1) if len(weights.shape) == 1 else weights, + values=(numpy.expand_dims(weights, axis=1) if len(weights.shape) == 1 else weights), options=weight_options, ) self._q_weights = q_weights.qvalues @@ -1988,7 +2274,7 @@ def fit(self, X: Data, y: Target, **fit_parameters): # We assume that the inputs have the same distribution as the _fit_X q_fit_X = QuantizedArray( n_bits=self.n_bits, - values=numpy.expand_dims(_fit_X, axis=1) if len(_fit_X.shape) == 1 else _fit_X, + values=(numpy.expand_dims(_fit_X, axis=1) if len(_fit_X.shape) == 1 else _fit_X), options=input_options, ) self._q_fit_X = q_fit_X.qvalues diff --git a/src/concrete/ml/sklearn/tree_to_numpy.py b/src/concrete/ml/sklearn/tree_to_numpy.py index 14f4ab732c..a124d64444 100644 --- a/src/concrete/ml/sklearn/tree_to_numpy.py +++ b/src/concrete/ml/sklearn/tree_to_numpy.py @@ -6,6 +6,7 @@ import numpy import onnx +import sklearn from onnx 
import numpy_helper from ..common.debugging.custom_assert import assert_true @@ -44,11 +45,11 @@ MIN_CIRCUIT_THRESHOLD_FOR_TREES = 4 -def get_onnx_model(model: Callable, x: numpy.ndarray, framework: str) -> onnx.ModelProto: +def get_onnx_model(model, x: numpy.ndarray, framework: str) -> onnx.ModelProto: """Create ONNX model with Hummingbird convert method. Args: - model (Callable): The tree model to convert. + model: The tree model to convert. x (numpy.ndarray): Dataset used to trace the tree inference and convert the model to ONNX. framework (str): The framework from which the ONNX model is generated. (options: 'xgboost', 'sklearn') @@ -328,7 +329,7 @@ def tree_values_preprocessing( # pylint: disable=too-many-locals def tree_to_numpy( - model: Callable, + model: sklearn.base.BaseEstimator, x: numpy.ndarray, framework: str, use_rounding: bool = True, @@ -411,6 +412,9 @@ def _compute_lsb_to_remove_for_trees( Returns: Tuple[int, int]: the number of LSB to remove for level 1 and level 2 + + Raises: + ValueError: if comparison function ('Less' or 'LessOrEqual') cannot be determined. 
""" def get_bitwidth(array: numpy.ndarray) -> int: @@ -502,6 +506,9 @@ def get_lsbs_to_remove_for_trees(array: numpy.ndarray) -> int: stage_1 = bias_1 - (q_x @ mat_1.transpose(0, 2, 1)) matrix_q = stage_1 >= 0 + else: + raise ValueError("Couldn't see if the comparison is 'Less' or 'LessOrEqual'") + lsbs_to_remove_for_trees_stage_1 = get_lsbs_to_remove_for_trees(stage_1) # If operator is `==`, np.equal(x, y) is equivalent to: diff --git a/tests/sklearn/test_sklearn_models.py b/tests/sklearn/test_sklearn_models.py index b9cb33e84a..5f4ebc78f6 100644 --- a/tests/sklearn/test_sklearn_models.py +++ b/tests/sklearn/test_sklearn_models.py @@ -39,7 +39,13 @@ import torch from sklearn.decomposition import PCA from sklearn.exceptions import ConvergenceWarning, UndefinedMetricWarning -from sklearn.metrics import make_scorer, matthews_corrcoef, top_k_accuracy_score +from sklearn.metrics import ( + accuracy_score, + make_scorer, + matthews_corrcoef, + mean_squared_error, + top_k_accuracy_score, +) from sklearn.model_selection import GridSearchCV from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler @@ -71,6 +77,7 @@ _get_sklearn_neural_net_models, _get_sklearn_tree_models, ) +from concrete.ml.sklearn.base import BaseTreeClassifierMixin, BaseTreeEstimatorMixin # Allow multiple runs in FHE to make sure we always have the correct output N_ALLOWED_FHE_RUN = 5 @@ -679,7 +686,12 @@ def check_input_support(model_class, n_bits, default_configuration, x, y, input_ def cast_input(x, y, input_type): "Convert x and y either in Pandas, List, Numpy or Torch type." 
- assert input_type in ["pandas", "torch", "list", "numpy"], "Not a valid type casting" + assert input_type in [ + "pandas", + "torch", + "list", + "numpy", + ], "Not a valid type casting" if input_type.lower() == "pandas": # Turn into Pandas @@ -817,7 +829,12 @@ def check_grid_search(model_class, x, y, scoring): pytest.skip("Skipping predict_proba for KNN, doesn't work for now") _ = GridSearchCV( - model_class(), param_grid, cv=2, scoring=scoring, error_score="raise", n_jobs=1 + model_class(), + param_grid, + cv=2, + scoring=scoring, + error_score="raise", + n_jobs=1, ).fit(x, y) @@ -1120,6 +1137,209 @@ def check_exposition_structural_methods_decision_trees(model, x, y): ) +# Add a test to match fp32 -> quant -> dequant weights at the ONNX level +# in the high bit width setting +# Some snippet to do this: +# if n_bits > 17: +# diff = init_tensor - init_tensor_as_int +# max_diff = numpy.abs(diff).max() +# if max_diff > 1e-3: +# raise ValueError(f"{max_diff=} > 1e-3") + + +# pylint: disable-next=too-many-locals,too-many-statements +# TODO: make this pass with rounding (high-bit-width create overflow) +@pytest.mark.parametrize("model_class, parameters", get_sklearn_tree_models_and_datasets()) +@pytest.mark.parametrize("use_rounding", [False, True]) +def test_load_fitted_sklearn_tree_models( + subtests, + model_class, + parameters, + use_rounding, + load_data, + is_weekly_option, + verbose=True, +): + """Test `from_sklearn_model` functionality of tree-based models.""" + + numpy.random.seed(0) + os.environ["TREES_USE_ROUNDING"] = str(int(use_rounding)) + + x, y = get_dataset( + model_class, parameters, min(N_BITS_REGULAR_BUILDS), load_data, is_weekly_option + ) + + if verbose: + print("Run check_load_pre_trained_sklearn_models") + + assert issubclass(model_class, BaseTreeEstimatorMixin) + concrete_model = instantiate_model_generic(model_class, n_bits=min(N_BITS_REGULAR_BUILDS)) + # Fit the model and retrieve both the Concrete ML and the scikit-learn models + with
warnings.catch_warnings(): + # Sometimes, we miss convergence, which is not a problem for our test + warnings.simplefilter("ignore", category=ConvergenceWarning) + concrete_model, sklearn_model = concrete_model.fit_benchmark(x, y) + + # This step is needed in order to handle partial classes + model_class = get_model_class(model_class) + max_n_bits = 18 + reasonable_n_bits = 8 + + # TODO: add normal bit-width comparison + if isinstance(concrete_model, BaseTreeClassifierMixin): + for n_bits, cml_tolerance, sklearn_tolerance in [ + (max_n_bits, 1e-1, 1e-7), + (reasonable_n_bits, 6e-2, 6e-2), + ]: + # Load a Concrete ML model from the fitted scikit-learn one + loaded_from_threshold = model_class.from_sklearn_model( + sklearn_model, + X=None, + n_bits=n_bits, + ) + + loaded_from_data = model_class.from_sklearn_model( + sklearn_model, + X=x, + n_bits=n_bits, + ) + + # Compile both the initial Concrete ML model and the loaded one + concrete_model.compile(x) + mode = "disable" + if n_bits <= 8: + loaded_from_threshold.compile(x) + loaded_from_data.compile(x) + + # Compute and compare the predictions from both models + # Classifiers + + # Predict with all models + sklearn_pred = sklearn_model.predict_proba(x) + cml_y_pred = concrete_model.predict_proba( + x, + fhe=mode, + ) + cml_threshold_y_pred = loaded_from_threshold.predict_proba( + x, + fhe=mode, + ) + cml_data_y_pred = loaded_from_data.predict_proba( + x, + fhe=mode, + ) + + # Compute accuracy + sklearn_accuracy = accuracy_score(sklearn_pred.argmax(axis=1), y) + cml_accuracy = accuracy_score(cml_y_pred.argmax(axis=1), y) + loaded_accuracy_from_threshold_accuracy = accuracy_score( + cml_threshold_y_pred.argmax(axis=1), y + ) + loaded_accuracy_from_data_accuracy = accuracy_score(cml_data_y_pred.argmax(axis=1), y) + + # Compare with sklearn + with subtests.test( + msg="Classifier Sklearn vs Threshold", n_bits=n_bits, tolerance=sklearn_tolerance + ): + value = numpy.abs(loaded_accuracy_from_threshold_accuracy - 
sklearn_accuracy) + assert ( + value < sklearn_tolerance + ), f"{loaded_accuracy_from_threshold_accuracy=} != {sklearn_accuracy} ({value})" + with subtests.test( + msg="Classifier Sklearn vs Data", n_bits=n_bits, tolerance=sklearn_tolerance + ): + value = numpy.abs(loaded_accuracy_from_data_accuracy - sklearn_accuracy) + assert ( + value < sklearn_tolerance + ), f"{loaded_accuracy_from_data_accuracy=} != {sklearn_accuracy} ({value})" + + # Compare with CML final metric + with subtests.test( + msg="Classifier CML vs Threshold", n_bits=n_bits, tolerance=cml_tolerance + ): + value = numpy.abs(loaded_accuracy_from_threshold_accuracy - cml_accuracy) + assert ( + value < cml_tolerance + ), f"{loaded_accuracy_from_threshold_accuracy=} != {cml_accuracy} ({value})" + with subtests.test( + msg="Classifier CML vs Data", n_bits=n_bits, tolerance=cml_tolerance + ): + value = numpy.abs(loaded_accuracy_from_data_accuracy - cml_accuracy) + assert ( + value < cml_tolerance + ), f"{loaded_accuracy_from_data_accuracy=} != {cml_accuracy} ({value})" + + # Regressor + else: + for n_bits, cml_tolerance, sklearn_tolerance in [ + (max_n_bits, 0.8, 1e-5), + (reasonable_n_bits, 1.4, 1.4), + ]: + # Load a Concrete ML model from the fitted scikit-learn one + loaded_from_threshold = model_class.from_sklearn_model( + sklearn_model, + n_bits=n_bits, + ) + + loaded_from_data = model_class.from_sklearn_model( + sklearn_model, + X=x, + n_bits=n_bits, + ) + + # Compile both the initial Concrete ML model and the loaded one + concrete_model.compile(x) + mode = "disable" + if n_bits <= 8: + loaded_from_threshold.compile(x) + loaded_from_data.compile(x) + + # Compute and compare the predictions from both models + # Regressors + + # Predict + sklearn_pred = sklearn_model.predict(x) + cml_y_pred = concrete_model.predict(x, fhe=mode) + cml_threshold_y_pred = loaded_from_threshold.predict(x, fhe=mode) + cml_data_y_pred = loaded_from_data.predict(x, fhe=mode) + + # Compute metric + sklearn_mse = 
mean_squared_error(sklearn_pred, y) + cml_mse = mean_squared_error(cml_y_pred, y) + loaded_mse_from_threshold_mse = mean_squared_error(cml_threshold_y_pred, y) + loaded_mse_from_data_mse = mean_squared_error(cml_data_y_pred, y) + + # Compare with scikit-learn + with subtests.test( + msg="Regression Sklearn vs Threshold", n_bits=n_bits, tolerance=sklearn_tolerance + ): + value = numpy.abs(loaded_mse_from_threshold_mse - sklearn_mse) / numpy.abs(y).max() + assert ( + value < sklearn_tolerance + ), f"{loaded_mse_from_threshold_mse=} != {sklearn_mse} ({value=}>={sklearn_tolerance=})" + with subtests.test( + msg="Regression Sklearn vs Data", n_bits=n_bits, tolerance=sklearn_tolerance + ): + value = numpy.abs(loaded_mse_from_data_mse - sklearn_mse) / numpy.abs(y).max() + assert ( + value < sklearn_tolerance + ), f"{loaded_mse_from_data_mse=} != {sklearn_mse} ({value=}>={sklearn_tolerance=})" + + # Compare with Concrete ML + with subtests.test( + msg="Regression CML vs Threshold", n_bits=n_bits, tolerance=cml_tolerance + ): + value = numpy.abs(loaded_mse_from_threshold_mse - cml_mse) / numpy.abs(y).max() + assert ( + value < cml_tolerance + ), f"{loaded_mse_from_threshold_mse=} != {cml_mse} ({value=}>={cml_tolerance=})" + with subtests.test( + msg="Regression CML vs Data", n_bits=n_bits, tolerance=cml_tolerance + ): + value = numpy.abs(loaded_mse_from_data_mse - cml_mse) / numpy.abs(y).max() + assert value < cml_tolerance, f"{loaded_mse_from_data_mse=} != {cml_mse} ({value=}>={cml_tolerance=})" + + def check_load_fitted_sklearn_linear_models(model_class, n_bits, x, y, check_float_array_equal): """Check that linear models and QNNs support loading from pre-trained scikit-learn models.""" @@ -1135,7 +1355,11 @@ def check_load_fitted_sklearn_linear_models(model_class, n_bits, x, y, check_flo model_class = get_model_class(model_class) # Load a Concrete ML model from the fitted scikit-learn one - loaded_concrete_model = model_class.from_sklearn_model(sklearn_model, X=x,
n_bits=n_bits) + loaded_concrete_model = model_class.from_sklearn_model( + sklearn_model, + X=x, + n_bits=n_bits, + ) # Compile both the initial Concrete ML model and the loaded one concrete_model.compile(x) @@ -1353,7 +1577,9 @@ def test_hyper_parameters( pytest.param("recall", True), pytest.param("roc_auc", True), pytest.param( - make_scorer(matthews_corrcoef, greater_is_better=True), True, id="matthews_corrcoef" + make_scorer(matthews_corrcoef, greater_is_better=True), + True, + id="matthews_corrcoef", ), pytest.param("explained_variance", False), pytest.param("max_error", False), @@ -1533,7 +1759,8 @@ def test_inference_methods( # and needs further investigations # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/2779 @pytest.mark.parametrize( - "model_class, parameters", get_sklearn_all_models_and_datasets(ignore="RandomForest") + "model_class, parameters", + get_sklearn_all_models_and_datasets(ignore="RandomForest"), ) @pytest.mark.parametrize( "n_bits", @@ -1574,7 +1801,10 @@ def test_pipeline( n_bits for n_bits in N_BITS_WEEKLY_ONLY_BUILDS + N_BITS_REGULAR_BUILDS if n_bits - < min(N_BITS_LINEAR_MODEL_CRYPTO_PARAMETERS, N_BITS_THRESHOLD_TO_FORCE_EXECUTION_NOT_IN_FHE) + < min( + N_BITS_LINEAR_MODEL_CRYPTO_PARAMETERS, + N_BITS_THRESHOLD_TO_FORCE_EXECUTION_NOT_IN_FHE, + ) ], ) # pylint: disable=too-many-branches @@ -1757,7 +1987,8 @@ def check_for_divergent_predictions( # This test is only relevant for classifier models @pytest.mark.parametrize( - "model_class, parameters", get_sklearn_all_models_and_datasets(regressor=False, classifier=True) + "model_class, parameters", + get_sklearn_all_models_and_datasets(regressor=False, classifier=True), ) def test_class_mapping( model_class, @@ -1801,7 +2032,8 @@ def test_exposition_of_sklearn_attributes( @pytest.mark.parametrize( - "model_class, parameters", get_sklearn_tree_models_and_datasets(select="DecisionTree") + "model_class, parameters", + get_sklearn_tree_models_and_datasets(select="DecisionTree"), 
) def test_exposition_structural_methods_decision_trees( model_class,