Skip to content

Commit

Permalink
Deal with non-decimal float values in the unseen values calculator
Browse files Browse the repository at this point in the history
  • Loading branch information
nnansters committed Aug 26, 2024
1 parent 08fa87d commit 0a35cf2
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 3 deletions.
15 changes: 13 additions & 2 deletions nannyml/data_quality/unseen/calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ def __init__(
self,
column_names: Union[str, List[str]],
normalize: bool = True,
y_pred_column_name: Optional[str] = None,
y_true_column_name: Optional[str] = None,
timestamp_column_name: Optional[str] = None,
chunk_size: Optional[int] = None,
chunk_number: Optional[int] = None,
Expand Down Expand Up @@ -96,6 +98,10 @@ def __init__(
"column_names should be either a column name string or a list of columns names strings, "
"found\n{column_names}"
)

self.y_pred_column_name = y_pred_column_name
self.y_true_column_name = y_true_column_name

self.result: Optional[Result] = None
# Threshold strategy is the same across all columns
# By default for unseen values there is no lower threshold or threshold limit.
Expand Down Expand Up @@ -135,6 +141,12 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs):
# Included columns of dtype=int should be considered categorical. We'll try converting those explicitly.
reference_data = _convert_int_columns_to_categorical(reference_data, self.column_names, self._logger)

# y_true and y_pred columns are treated as categorical for the purpose of this calculator
if self.y_pred_column_name:
reference_data[self.y_pred_column_name] = reference_data[self.y_pred_column_name].astype('category')
if self.y_true_column_name:
reference_data[self.y_true_column_name] = reference_data[self.y_true_column_name].astype('category')

# All provided columns must be categorical
continuous_column_names, categorical_column_names = _split_features_by_type(reference_data, self.column_names)
if not set(self.column_names) == set(categorical_column_names):
Expand Down Expand Up @@ -263,8 +275,7 @@ def _convert_int_columns_to_categorical(
int_cols = list(
filter(
lambda c: c in column_names
- and data[c].dtype in ('int_', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64')
- or (data[c].dtype in ('float_', 'float16', 'float32', 'float64') and (data[c] % 1 == 0).all()),
+ and data[c].dtype in ('int_', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64'),
data.columns,
)
)
Expand Down
4 changes: 3 additions & 1 deletion tests/data_quality/test_unseen.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ def test_input_dataframes_are_not_altered_by_calculator(): # noqa: D103
pd.testing.assert_frame_equal(reference, reference2)


- def test_int_values_are_treated_as_categorical(): # noqa: D103
+ def test_float_target_values_are_treated_as_categorical(): # noqa: D103
reference, analysis, _ = load_synthetic_car_loan_data_quality_dataset()
try:
_ = UnseenValuesCalculator(
Expand All @@ -262,6 +262,8 @@ def test_int_values_are_treated_as_categorical(): # noqa: D103
],
timestamp_column_name='timestamp',
normalize=False,
y_pred_column_name='y_pred',
y_true_column_name='repaid',
).fit(reference)
except InvalidArgumentsException:
pytest.fail()

0 comments on commit 0a35cf2

Please sign in to comment.