From 420afc5372ba98a1563bae75567d692823746078 Mon Sep 17 00:00:00 2001 From: Villu Ruusmann Date: Mon, 18 Mar 2024 14:29:50 +0200 Subject: [PATCH] Improved support for Pandas extension data types --- sklearn2pmml/decoration/__init__.py | 14 +++++--- sklearn2pmml/decoration/tests/__init__.py | 39 +++++++++++++++++++++-- 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/sklearn2pmml/decoration/__init__.py b/sklearn2pmml/decoration/__init__.py index cd7b206..57ead29 100644 --- a/sklearn2pmml/decoration/__init__.py +++ b/sklearn2pmml/decoration/__init__.py @@ -136,8 +136,8 @@ def _to_missing(self, X, where): def _missing_value_mask(self, X): if self.missing_values is not None: def is_missing(X, missing_value): - # float("NaN") != float("NaN") - if isinstance(missing_value, float) and numpy.isnan(missing_value): + # Values like float("NaN"), Numpy.NaN and Pandas.NA fail the '==' operator + if pandas.isnull(missing_value): return pandas.isnull(X) return X == missing_value @@ -380,10 +380,14 @@ def fit(self, X, y = None): return self X = to_numpy(X) if self.with_data: - if issubclass(self.dtype_.type, numbers.Integral): - info = numpy.iinfo(self.dtype_) + dtype = self.dtype_ + # Unbox Pandas' extension data type to Numpy data type + if hasattr(dtype, "numpy_dtype"): + dtype = dtype.numpy_dtype + if issubclass(dtype.type, numbers.Integral): + info = numpy.iinfo(dtype) else: - info = numpy.finfo(self.dtype_) + info = numpy.finfo(dtype) missing_mask = self._missing_value_mask(X) nonmissing_mask = ~missing_mask if self.data_min is None: diff --git a/sklearn2pmml/decoration/tests/__init__.py b/sklearn2pmml/decoration/tests/__init__.py index bd065e0..8b8f55a 100644 --- a/sklearn2pmml/decoration/tests/__init__.py +++ b/sklearn2pmml/decoration/tests/__init__.py @@ -1,5 +1,5 @@ from datetime import datetime -from pandas import Categorical, CategoricalDtype, DataFrame, Series +from pandas import BooleanDtype, Categorical, CategoricalDtype, DataFrame, Int64Dtype, Series from sklearn.base import clone from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline @@ -80,6 +80,27 @@ def test_init(self): class CategoricalDomainTest(TestCase): + def test_fit_boolean_missing(self): + domain = clone(CategoricalDomain()) + X = Series([0, pandas.NA, 0, 1], dtype = BooleanDtype()) + self.assertIsInstance(X.dtype, BooleanDtype) + self.assertEqual([False, True, False, False], domain._missing_value_mask(X).tolist()) + Xt = domain.fit_transform(X) + self.assertIsInstance(Xt, Series) + self.assertIsInstance(Xt.dtype, BooleanDtype) + self.assertFalse(hasattr(domain, "n_features_in_")) + self.assertEqual([False, True], domain.data_values_.tolist()) + self.assertEqual([False, pandas.NA, False, True], Xt.tolist()) + domain = clone(CategoricalDomain()) + X = to_numpy(X) + self.assertEqual([False, True, False, False], domain._missing_value_mask(X).tolist()) + Xt = domain.fit_transform(X) + self.assertIsInstance(Xt, numpy.ndarray) + self.assertEqual(numpy.dtype("O"), Xt.dtype) + self.assertFalse(hasattr(domain, "n_features_in_")) + self.assertEqual([False, True], domain.data_values_.tolist()) + self.assertEqual([False, None, False, True], Xt.tolist()) + def test_fit_int(self): domain = clone(CategoricalDomain(with_data = False, with_statistics = False)) self.assertTrue(domain._empty_fit()) @@ -162,10 +183,12 @@ def test_fit_int_categorical(self): def test_fit_int64(self): domain = clone(CategoricalDomain()) - X = Series([-1, None, 1, 2, -1]).astype("Int64") + X = Series([-1, pandas.NA, 1, 2, -1], dtype = "Int64") + self.assertIsInstance(X.dtype, Int64Dtype) self.assertEqual([False, True, False, False, False], domain._missing_value_mask(X).tolist()) Xt = domain.fit_transform(X) self.assertIsInstance(Xt, Series) + self.assertIsInstance(Xt.dtype, Int64Dtype) self.assertFalse(hasattr(domain, "n_features_in_")) self.assertEqual([-1, 1, 2], domain.data_values_.tolist()) self.assertEqual([-1, pandas.NA, 1, 2, -1], Xt.tolist()) @@ -401,6 +424,18 @@ def test_fit_int(self): self.assertEqual(3, domain.data_max_) self.assertEqual({"totalFreq" : 5, "missingFreq" : 2, "invalidFreq" : 0}, domain.counts_) + def test_fit_int64(self): + domain = clone(ContinuousDomain(with_statistics = True)) + X = Series([-2, -1, 0, -1, 3], dtype = "Int64") + self.assertIsInstance(X.dtype, Int64Dtype) + Xt = domain.fit_transform(X) + self.assertIsInstance(Xt, Series) + self.assertIsInstance(Xt.dtype, Int64Dtype) + self.assertFalse(hasattr(domain, "n_features_in_")) + self.assertEqual(-2, domain.data_min_) + self.assertEqual(3, domain.data_max_) + self.assertEqual({"totalFreq" : 5, "missingFreq" : 0, "invalidFreq" : 0}, domain.counts_) + def test_mapper(self): domain = ContinuousDomain(with_statistics = True) X = DataFrame([[2.0, 2], [1.0, 0.5], [2, float("NaN")], [float("NaN"), 2], [2.0, float("NaN")], [3.0, 3.5]], columns = ["x1", "x2"])