Deal with whole-number (non-fractional) float values in the unseen values calculator

NannyML · Aug 26, 2024 · 0a35cf2
1 parent 08fa87d
commit 0a35cf2
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 3 deletions.
diff --git a/nannyml/data_quality/unseen/calculator.py b/nannyml/data_quality/unseen/calculator.py
@@ -35,6 +35,8 @@ def __init__(
  self,
  column_names: Union[str, List[str]],
  normalize: bool = True,
+ y_pred_column_name: Optional[str] = None,
+ y_true_column_name: Optional[str] = None,
  timestamp_column_name: Optional[str] = None,
  chunk_size: Optional[int] = None,
  chunk_number: Optional[int] = None,
@@ -96,6 +98,10 @@ def __init__(
  "column_names should be either a column name string or a list of columns names strings, "
  "found\n{column_names}"
  )
+
+ self.y_pred_column_name = y_pred_column_name
+ self.y_true_column_name = y_true_column_name
+
  self.result: Optional[Result] = None
  # Threshold strategy is the same across all columns
  # By default for unseen values there is no lower threshold or threshold limit.
@@ -135,6 +141,12 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs):
  # Included columns of dtype=int should be considered categorical. We'll try converting those explicitly.
  reference_data = _convert_int_columns_to_categorical(reference_data, self.column_names, self._logger)
 
+ # y_true and y_pred columns are treated as categorical for the purpose of this calculator
+ if self.y_pred_column_name:
+ reference_data[self.y_pred_column_name] = reference_data[self.y_pred_column_name].astype('category')
+ if self.y_true_column_name:
+ reference_data[self.y_true_column_name] = reference_data[self.y_true_column_name].astype('category')
+
  # All provided columns must be categorical
  continuous_column_names, categorical_column_names = _split_features_by_type(reference_data, self.column_names)
  if not set(self.column_names) == set(categorical_column_names):
@@ -263,8 +275,7 @@ def _convert_int_columns_to_categorical(
  int_cols = list(
  filter(
  lambda c: c in column_names
- and data[c].dtype in ('int_', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64')
- or (data[c].dtype in ('float_', 'float16', 'float32', 'float64') and (data[c] % 1 == 0).all()),
+ and data[c].dtype in ('int_', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64'),
  data.columns,
  )
  )

diff --git a/tests/data_quality/test_unseen.py b/tests/data_quality/test_unseen.py
@@ -253,7 +253,7 @@ def test_input_dataframes_are_not_altered_by_calculator(): # noqa: D103
  pd.testing.assert_frame_equal(reference, reference2)
 
 
-def test_int_values_are_treated_as_categorical(): # noqa: D103
+def test_float_target_values_are_treated_as_categorical(): # noqa: D103
  reference, analysis, _ = load_synthetic_car_loan_data_quality_dataset()
  try:
  _ = UnseenValuesCalculator(
@@ -262,6 +262,8 @@ def test_int_values_are_treated_as_categorical(): # noqa: D103
  ],
  timestamp_column_name='timestamp',
  normalize=False,
+ y_pred_column_name='y_pred',
+ y_true_column_name='repaid',
  ).fit(reference)
  except InvalidArgumentsException:
  pytest.fail()