Skip to content

Commit

Permalink
Improved support for Pandas extension data types
Browse files Browse the repository at this point in the history
  • Loading branch information
vruusmann committed Mar 18, 2024
1 parent 80c3254 commit 420afc5
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 7 deletions.
14 changes: 9 additions & 5 deletions sklearn2pmml/decoration/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,8 @@ def _to_missing(self, X, where):
def _missing_value_mask(self, X):
if self.missing_values is not None:
def is_missing(X, missing_value):
# float("NaN") != float("NaN")
if isinstance(missing_value, float) and numpy.isnan(missing_value):
# Values like float("NaN"), numpy.nan and pandas.NA cannot be matched using the '==' operator
if pandas.isnull(missing_value):
return pandas.isnull(X)
return X == missing_value

Expand Down Expand Up @@ -380,10 +380,14 @@ def fit(self, X, y = None):
return self
X = to_numpy(X)
if self.with_data:
if issubclass(self.dtype_.type, numbers.Integral):
info = numpy.iinfo(self.dtype_)
dtype = self.dtype_
# Unbox Pandas' extension data type to Numpy data type
if hasattr(dtype, "numpy_dtype"):
dtype = dtype.numpy_dtype
if issubclass(dtype.type, numbers.Integral):
info = numpy.iinfo(dtype)
else:
info = numpy.finfo(self.dtype_)
info = numpy.finfo(dtype)
missing_mask = self._missing_value_mask(X)
nonmissing_mask = ~missing_mask
if self.data_min is None:
Expand Down
39 changes: 37 additions & 2 deletions sklearn2pmml/decoration/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from datetime import datetime
from pandas import Categorical, CategoricalDtype, DataFrame, Series
from pandas import BooleanDtype, Categorical, CategoricalDtype, DataFrame, Int64Dtype, Series
from sklearn.base import clone
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
Expand Down Expand Up @@ -80,6 +80,27 @@ def test_init(self):

class CategoricalDomainTest(TestCase):

def test_fit_boolean_missing(self):
    """Fit a default CategoricalDomain on nullable boolean data that holds one pandas.NA.

    The same checks run twice: first on the Pandas series itself, then
    on the result of passing that series through to_numpy().
    """
    missing_mask = [False, True, False, False]
    valid_values = [False, True]

    # Pass 1: Pandas series with the boolean extension data type
    series_domain = clone(CategoricalDomain())
    series = Series([0, pandas.NA, 0, 1], dtype = BooleanDtype())
    self.assertIsInstance(series.dtype, BooleanDtype)
    self.assertEqual(missing_mask, series_domain._missing_value_mask(series).tolist())
    series_out = series_domain.fit_transform(series)
    # The container type and the extension data type are both expected to be retained
    self.assertIsInstance(series_out, Series)
    self.assertIsInstance(series_out.dtype, BooleanDtype)
    self.assertFalse(hasattr(series_domain, "n_features_in_"))
    self.assertEqual(valid_values, series_domain.data_values_.tolist())
    self.assertEqual([False, pandas.NA, False, True], series_out.tolist())

    # Pass 2: the same data after conversion with to_numpy()
    array_domain = clone(CategoricalDomain())
    array = to_numpy(series)
    self.assertEqual(missing_mask, array_domain._missing_value_mask(array).tolist())
    array_out = array_domain.fit_transform(array)
    self.assertIsInstance(array_out, numpy.ndarray)
    self.assertEqual(numpy.dtype("O"), array_out.dtype)
    self.assertFalse(hasattr(array_domain, "n_features_in_"))
    self.assertEqual(valid_values, array_domain.data_values_.tolist())
    # Here the missing element is expected to surface as None
    self.assertEqual([False, None, False, True], array_out.tolist())

def test_fit_int(self):
domain = clone(CategoricalDomain(with_data = False, with_statistics = False))
self.assertTrue(domain._empty_fit())
Expand Down Expand Up @@ -162,10 +183,12 @@ def test_fit_int_categorical(self):

def test_fit_int64(self):
domain = clone(CategoricalDomain())
X = Series([-1, None, 1, 2, -1]).astype("Int64")
X = Series([-1, pandas.NA, 1, 2, -1], dtype = "Int64")
self.assertIsInstance(X.dtype, Int64Dtype)
self.assertEqual([False, True, False, False, False], domain._missing_value_mask(X).tolist())
Xt = domain.fit_transform(X)
self.assertIsInstance(Xt, Series)
self.assertIsInstance(Xt.dtype, Int64Dtype)
self.assertFalse(hasattr(domain, "n_features_in_"))
self.assertEqual([-1, 1, 2], domain.data_values_.tolist())
self.assertEqual([-1, pandas.NA, 1, 2, -1], Xt.tolist())
Expand Down Expand Up @@ -401,6 +424,18 @@ def test_fit_int(self):
self.assertEqual(3, domain.data_max_)
self.assertEqual({"totalFreq" : 5, "missingFreq" : 2, "invalidFreq" : 0}, domain.counts_)

def test_fit_int64(self):
    """Fit a statistics-collecting ContinuousDomain on an "Int64" series without missing values."""
    int_domain = clone(ContinuousDomain(with_statistics = True))
    int_series = Series([-2, -1, 0, -1, 3], dtype = "Int64")
    self.assertIsInstance(int_series.dtype, Int64Dtype)

    transformed = int_domain.fit_transform(int_series)

    # The container type and the extension data type are both expected to be retained
    self.assertIsInstance(transformed, Series)
    self.assertIsInstance(transformed.dtype, Int64Dtype)
    self.assertFalse(hasattr(int_domain, "n_features_in_"))

    # Value range and frequency counts collected during fitting
    self.assertEqual(-2, int_domain.data_min_)
    self.assertEqual(3, int_domain.data_max_)
    expected_counts = {"totalFreq" : 5, "missingFreq" : 0, "invalidFreq" : 0}
    self.assertEqual(expected_counts, int_domain.counts_)

def test_mapper(self):
domain = ContinuousDomain(with_statistics = True)
X = DataFrame([[2.0, 2], [1.0, 0.5], [2, float("NaN")], [float("NaN"), 2], [2.0, float("NaN")], [3.0, 3.5]], columns = ["x1", "x2"])
Expand Down

0 comments on commit 420afc5

Please sign in to comment.