From 9e4c8ebf6f29c8e219b5d22038816c744b350959 Mon Sep 17 00:00:00 2001 From: Simon Date: Mon, 3 Jun 2024 00:08:15 +0200 Subject: [PATCH 1/4] add InvalidFitData and test cases --- src/safeds/exceptions/__init__.py | 2 + src/safeds/exceptions/_ml.py | 8 + .../nn/converters/_input_converter_table.py | 19 +++ tests/safeds/ml/nn/test_model.py | 140 +++++++++++++++++- 4 files changed, 168 insertions(+), 1 deletion(-) diff --git a/src/safeds/exceptions/__init__.py b/src/safeds/exceptions/__init__.py index 8f1e9de6d..2f84387c9 100644 --- a/src/safeds/exceptions/__init__.py +++ b/src/safeds/exceptions/__init__.py @@ -18,6 +18,7 @@ DatasetMissesFeaturesError, FeatureDataMismatchError, InputSizeError, + InvalidFitDataError, InvalidModelStructureError, LearningError, ModelNotFittedError, @@ -69,6 +70,7 @@ class OutOfBoundsError(SafeDsError): "DatasetMissesDataError", "DatasetMissesFeaturesError", "FeatureDataMismatchError", + "InvalidFitDataError", "InputSizeError", "InvalidModelStructureError", "LearningError", diff --git a/src/safeds/exceptions/_ml.py b/src/safeds/exceptions/_ml.py index d84395485..a35e9f73c 100644 --- a/src/safeds/exceptions/_ml.py +++ b/src/safeds/exceptions/_ml.py @@ -22,6 +22,14 @@ def __init__(self) -> None: super().__init__("Dataset contains no rows") +class InvalidFitDataError(Exception): + """Raised when a Neural Network is fitted on invalid data.""" + + def __init__(self, reason: str) -> None: + super().__init__(f"The given Fit Data is invalid:\n{reason}") + + + class LearningError(Exception): """ Raised when an error occurred while training a model. diff --git a/src/safeds/ml/nn/converters/_input_converter_table.py b/src/safeds/ml/nn/converters/_input_converter_table.py index 52d64ac01..e11dba8d1 100644 --- a/src/safeds/ml/nn/converters/_input_converter_table.py +++ b/src/safeds/ml/nn/converters/_input_converter_table.py @@ -4,6 +4,7 @@ from safeds.data.labeled.containers import TabularDataset from safeds.data.tabular.containers import Column, Table +from safeds.exceptions import InvalidFitDataError from ._input_converter import InputConversion @@ -43,6 +44,24 @@ def _is_fit_data_valid(self, input_data: TabularDataset) -> bool: self._feature_names = input_data.features.column_names self._target_name = input_data.target.name self._first = False + + columns_with_missing_values = [] + columns_with_non_numerical_data = [] + + for col in input_data.features.add_columns([input_data.target]).to_columns(): + if col.missing_value_count() > 0: + columns_with_missing_values.append(col.name) + if not col.type.is_numeric: + columns_with_non_numerical_data.append(col.name) + + reason = "" + if len(columns_with_missing_values) > 0: + reason += f"The following Columns contain missing values: {columns_with_missing_values}\n" + if len(columns_with_non_numerical_data) > 0: + reason += f"The following Columns contain non-numerical data: {columns_with_non_numerical_data}" + if reason is not "": + raise InvalidFitDataError(reason) + return (sorted(input_data.features.column_names)).__eq__(sorted(self._feature_names)) def _is_predict_data_valid(self, input_data: Table) -> bool: diff --git a/tests/safeds/ml/nn/test_model.py b/tests/safeds/ml/nn/test_model.py index 5b8022a2c..8a54db0ca 100644 --- a/tests/safeds/ml/nn/test_model.py +++ b/tests/safeds/ml/nn/test_model.py @@ -1,4 +1,5 @@ import pickle +import re import pytest from safeds.data.image.typing import ImageSize @@ -8,7 +9,7 @@ FeatureDataMismatchError, InvalidModelStructureError, ModelNotFittedError, - OutOfBoundsError, + OutOfBoundsError, InvalidFitDataError, ) from safeds.ml.nn import ( NeuralNetworkClassifier, @@ -231,6 +232,75 @@ def test_should_raise_if_train_features_mismatch(self, device: Device) -> None: ): learned_model.fit(Table.from_dict({"k": [0.1, 0, 0.2], "l": [0, 0.15, 0.5]}).to_tabular_dataset("k")) + @pytest.mark.parametrize( + ("table", "reason"), + [ + ( + Table.from_dict({"a": [1, 2, 3], "b": [1, 2, None], "c": [0, 15, 5]}).to_tabular_dataset("c"), + re.escape("The given Fit Data is invalid:\nThe following Columns contain missing values: ['b']\n"), + ), + ( + Table.from_dict({"a": [1, 2, "a"], "b": [1, 2, 3], "c": [0, 15, 5]}).to_tabular_dataset("c"), + re.escape("The given Fit Data is invalid:\nThe following Columns contain non-numerical data: ['a']"), + ), + ( + Table.from_dict({"a": [1, 2, "a"], "b": [1, 2, None], "c": [0, 15, 5]}).to_tabular_dataset("c"), + re.escape( + "The given Fit Data is invalid:\nThe following Columns contain missing values: ['b']\nThe following Columns contain non-numerical data: ['a']", + ), + ), + ( + Table.from_dict({"a": [1, 2, "a"], "b": [1, "b", None], "c": [0, 15, 5]}).to_tabular_dataset("c"), + re.escape( + "The given Fit Data is invalid:\nThe following Columns contain missing values: ['b']\nThe following Columns contain non-numerical data: ['a', 'b']", + ), + ), + ( + Table.from_dict({"a": [1, 2, 3], "b": [1, 2, 3], "c": [0, None, 5]}).to_tabular_dataset("c"), + re.escape("The given Fit Data is invalid:\nThe following Columns contain missing values: ['c']\n"), + ), + ( + Table.from_dict({"a": [1, 2, 3], "b": [1, 2, 3], "c": [0, "c", 5]}).to_tabular_dataset("c"), + re.escape("The given Fit Data is invalid:\nThe following Columns contain non-numerical data: ['c']"), + ), + ( + Table.from_dict({"a": [1, 2, 3], "b": [1, 2, 3], "c": [0, "c", None]}).to_tabular_dataset("c"), + re.escape( + "The given Fit Data is invalid:\nThe following Columns contain missing values: ['c']\nThe following Columns contain non-numerical data: ['c']", + ), + ), + ( + Table.from_dict({"a": [1, 2, "a"], "b": [1, "b", None], "c": [0, "c", None]}).to_tabular_dataset("c"), + re.escape( + "The given Fit Data is invalid:\nThe following Columns contain missing values: ['b', 'c']\nThe following Columns contain non-numerical data: ['a', 'b', 'c']", + ), + ), + ], + ids=[ + "missing value feature", + "non-numerical feature", + "missing value and non-numerical features", + "mixed missing and non-numerical features", + "missing value target", + "non-numerical target", + "missing value and non-numerical target", + "mixed missing and non-numerical features and target", + ], + ) + def test_should_catch_invalid_fit_data(self, device: Device, table: TabularDataset, reason: str) -> None: + configure_test_with_device(device) + model = NeuralNetworkClassifier( + InputConversionTable(), + [ForwardLayer(neuron_count=4), ForwardLayer(1)], + ) + with pytest.raises( + InvalidFitDataError, + match=reason, + ): + model.fit(table) + + + # def test_should_raise_if_table_size_and_input_size_mismatch(self, device: Device) -> None: # configure_test_with_device(device) # model = NeuralNetworkClassifier( @@ -609,6 +679,73 @@ def test_should_raise_if_train_features_mismatch(self, device: Device) -> None: Table.from_dict({"k": [1, 0, 2], "l": [0, 15, 5]}).to_tabular_dataset("l"), ) + + @pytest.mark.parametrize( + ("table", "reason"), + [ + ( + Table.from_dict({"a": [1, 2, 3], "b": [1, 2, None], "c": [0, 15, 5]}).to_tabular_dataset("c"), + re.escape("The given Fit Data is invalid:\nThe following Columns contain missing values: ['b']\n"), + ), + ( + Table.from_dict({"a": [1, 2, "a"], "b": [1, 2, 3], "c": [0, 15, 5]}).to_tabular_dataset("c"), + re.escape("The given Fit Data is invalid:\nThe following Columns contain non-numerical data: ['a']"), + ), + ( + Table.from_dict({"a": [1, 2, "a"], "b": [1, 2, None], "c": [0, 15, 5]}).to_tabular_dataset("c"), + re.escape( + "The given Fit Data is invalid:\nThe following Columns contain missing values: ['b']\nThe following Columns contain non-numerical data: ['a']", + ), + ), + ( + Table.from_dict({"a": [1, 2, "a"], "b": [1, "b", None], "c": [0, 15, 5]}).to_tabular_dataset("c"), + re.escape( + "The given Fit Data is invalid:\nThe following Columns contain missing values: ['b']\nThe following Columns contain non-numerical data: ['a', 'b']", + ), + ), + ( + Table.from_dict({"a": [1, 2, 3], "b": [1, 2, 3], "c": [0, None, 5]}).to_tabular_dataset("c"), + re.escape("The given Fit Data is invalid:\nThe following Columns contain missing values: ['c']\n"), + ), + ( + Table.from_dict({"a": [1, 2, 3], "b": [1, 2, 3], "c": [0, "c", 5]}).to_tabular_dataset("c"), + re.escape("The given Fit Data is invalid:\nThe following Columns contain non-numerical data: ['c']"), + ), + ( + Table.from_dict({"a": [1, 2, 3], "b": [1, 2, 3], "c": [0, "c", None]}).to_tabular_dataset("c"), + re.escape( + "The given Fit Data is invalid:\nThe following Columns contain missing values: ['c']\nThe following Columns contain non-numerical data: ['c']", + ), + ), + ( + Table.from_dict({"a": [1, 2, "a"], "b": [1, "b", None], "c": [0, "c", None]}).to_tabular_dataset("c"), + re.escape( + "The given Fit Data is invalid:\nThe following Columns contain missing values: ['b', 'c']\nThe following Columns contain non-numerical data: ['a', 'b', 'c']", + ), + ), + ], + ids=[ + "missing value feature", + "non-numerical feature", + "missing value and non-numerical features", + "mixed missing and non-numerical features", + "missing value target", + "non-numerical target", + "missing value and non-numerical target", + "mixed missing and non-numerical features and target", + ], + ) + def test_should_catch_invalid_fit_data(self, device: Device, table: TabularDataset, reason: str) -> None: + model = NeuralNetworkRegressor( + InputConversionTable(), + [ForwardLayer(neuron_count=4), ForwardLayer(1)], + ) + with pytest.raises( + InvalidFitDataError, + match=reason, + ): + model.fit(table) + # def test_should_raise_if_table_size_and_input_size_mismatch(self, device: Device) -> None: # configure_test_with_device(device) # model = NeuralNetworkRegressor( @@ -781,3 +918,4 @@ def test_should_be_pickleable(self, device: Device) -> None: # Should not raise pickle.dumps(fitted_model) + From 75a1d944b2ca1a922d9e90ad31c02a1165a070e5 Mon Sep 17 00:00:00 2001 From: Simon Date: Mon, 3 Jun 2024 00:12:37 +0200 Subject: [PATCH 2/4] linter fix --- tests/safeds/ml/nn/test_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/safeds/ml/nn/test_model.py b/tests/safeds/ml/nn/test_model.py index 8a54db0ca..1fcffcf6e 100644 --- a/tests/safeds/ml/nn/test_model.py +++ b/tests/safeds/ml/nn/test_model.py @@ -736,6 +736,7 @@ def test_should_raise_if_train_features_mismatch(self, device: Device) -> None: ], ) def test_should_catch_invalid_fit_data(self, device: Device, table: TabularDataset, reason: str) -> None: + configure_test_with_device(device) model = NeuralNetworkRegressor( InputConversionTable(), [ForwardLayer(neuron_count=4), ForwardLayer(1)], From 6b8c633d93ab7bd17143a745d031c67893842fd0 Mon Sep 17 00:00:00 2001 From: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> Date: Sun, 2 Jun 2024 22:14:16 +0000 Subject: [PATCH 3/4] style: apply automated linter fixes --- src/safeds/exceptions/_ml.py | 1 - src/safeds/ml/nn/converters/_input_converter_table.py | 2 +- tests/safeds/ml/nn/test_model.py | 7 ++----- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/safeds/exceptions/_ml.py b/src/safeds/exceptions/_ml.py index a35e9f73c..649ea0455 100644 --- a/src/safeds/exceptions/_ml.py +++ b/src/safeds/exceptions/_ml.py @@ -29,7 +29,6 @@ def __init__(self, reason: str) -> None: super().__init__(f"The given Fit Data is invalid:\n{reason}") - class LearningError(Exception): """ Raised when an error occurred while training a model. diff --git a/src/safeds/ml/nn/converters/_input_converter_table.py b/src/safeds/ml/nn/converters/_input_converter_table.py index e11dba8d1..7f26b39af 100644 --- a/src/safeds/ml/nn/converters/_input_converter_table.py +++ b/src/safeds/ml/nn/converters/_input_converter_table.py @@ -59,7 +59,7 @@ def _is_fit_data_valid(self, input_data: TabularDataset) -> bool: reason += f"The following Columns contain missing values: {columns_with_missing_values}\n" if len(columns_with_non_numerical_data) > 0: reason += f"The following Columns contain non-numerical data: {columns_with_non_numerical_data}" - if reason is not "": + if reason != "": raise InvalidFitDataError(reason) return (sorted(input_data.features.column_names)).__eq__(sorted(self._feature_names)) diff --git a/tests/safeds/ml/nn/test_model.py b/tests/safeds/ml/nn/test_model.py index 1fcffcf6e..db1b94840 100644 --- a/tests/safeds/ml/nn/test_model.py +++ b/tests/safeds/ml/nn/test_model.py @@ -7,9 +7,10 @@ from safeds.data.tabular.containers import Table from safeds.exceptions import ( FeatureDataMismatchError, + InvalidFitDataError, InvalidModelStructureError, ModelNotFittedError, - OutOfBoundsError, InvalidFitDataError, + OutOfBoundsError, ) from safeds.ml.nn import ( NeuralNetworkClassifier, @@ -299,8 +300,6 @@ def test_should_catch_invalid_fit_data(self, device: Device, table: TabularDatas ): model.fit(table) - - # def test_should_raise_if_table_size_and_input_size_mismatch(self, device: Device) -> None: # configure_test_with_device(device) # model = NeuralNetworkClassifier( @@ -679,7 +678,6 @@ def test_should_raise_if_train_features_mismatch(self, device: Device) -> None: Table.from_dict({"k": [1, 0, 2], "l": [0, 15, 5]}).to_tabular_dataset("l"), ) - @pytest.mark.parametrize( ("table", "reason"), [ @@ -919,4 +917,3 @@ def test_should_be_pickleable(self, device: Device) -> None: # Should not raise pickle.dumps(fitted_model) - From 620cbf1b6aaa9513d4bc39c9315ab99d007e3c00 Mon Sep 17 00:00:00 2001 From: Simon Date: Tue, 4 Jun 2024 15:33:00 +0200 Subject: [PATCH 4/4] adjust tests --- tests/safeds/ml/nn/test_model.py | 58 ++++++-------------------------- 1 file changed, 10 insertions(+), 48 deletions(-) diff --git a/tests/safeds/ml/nn/test_model.py b/tests/safeds/ml/nn/test_model.py index db1b94840..0902d630d 100644 --- a/tests/safeds/ml/nn/test_model.py +++ b/tests/safeds/ml/nn/test_model.py @@ -241,51 +241,32 @@ def test_should_raise_if_train_features_mismatch(self, device: Device) -> None: re.escape("The given Fit Data is invalid:\nThe following Columns contain missing values: ['b']\n"), ), ( - Table.from_dict({"a": [1, 2, "a"], "b": [1, 2, 3], "c": [0, 15, 5]}).to_tabular_dataset("c"), + Table.from_dict({"a": ["a", "b", "c"], "b": [1, 2, 3], "c": [0, 15, 5]}).to_tabular_dataset("c"), re.escape("The given Fit Data is invalid:\nThe following Columns contain non-numerical data: ['a']"), ), ( - Table.from_dict({"a": [1, 2, "a"], "b": [1, 2, None], "c": [0, 15, 5]}).to_tabular_dataset("c"), + Table.from_dict({"a": ["a", "b", "c"], "b": [1, 2, None], "c": [0, 15, 5]}).to_tabular_dataset("c"), re.escape( "The given Fit Data is invalid:\nThe following Columns contain missing values: ['b']\nThe following Columns contain non-numerical data: ['a']", ), ), - ( - Table.from_dict({"a": [1, 2, "a"], "b": [1, "b", None], "c": [0, 15, 5]}).to_tabular_dataset("c"), - re.escape( - "The given Fit Data is invalid:\nThe following Columns contain missing values: ['b']\nThe following Columns contain non-numerical data: ['a', 'b']", - ), - ), ( Table.from_dict({"a": [1, 2, 3], "b": [1, 2, 3], "c": [0, None, 5]}).to_tabular_dataset("c"), - re.escape("The given Fit Data is invalid:\nThe following Columns contain missing values: ['c']\n"), - ), - ( - Table.from_dict({"a": [1, 2, 3], "b": [1, 2, 3], "c": [0, "c", 5]}).to_tabular_dataset("c"), - re.escape("The given Fit Data is invalid:\nThe following Columns contain non-numerical data: ['c']"), - ), - ( - Table.from_dict({"a": [1, 2, 3], "b": [1, 2, 3], "c": [0, "c", None]}).to_tabular_dataset("c"), re.escape( - "The given Fit Data is invalid:\nThe following Columns contain missing values: ['c']\nThe following Columns contain non-numerical data: ['c']", + "The given Fit Data is invalid:\nThe following Columns contain missing values: ['c']\n", ), ), ( - Table.from_dict({"a": [1, 2, "a"], "b": [1, "b", None], "c": [0, "c", None]}).to_tabular_dataset("c"), - re.escape( - "The given Fit Data is invalid:\nThe following Columns contain missing values: ['b', 'c']\nThe following Columns contain non-numerical data: ['a', 'b', 'c']", - ), + Table.from_dict({"a": [1, 2, 3], "b": [1, 2, 3], "c": ["a", "b", "a"]}).to_tabular_dataset("c"), + re.escape("The given Fit Data is invalid:\nThe following Columns contain non-numerical data: ['c']"), ), ], ids=[ "missing value feature", "non-numerical feature", "missing value and non-numerical features", - "mixed missing and non-numerical features", "missing value target", "non-numerical target", - "missing value and non-numerical target", - "mixed missing and non-numerical features and target", ], ) def test_should_catch_invalid_fit_data(self, device: Device, table: TabularDataset, reason: str) -> None: @@ -686,51 +667,32 @@ def test_should_raise_if_train_features_mismatch(self, device: Device) -> None: re.escape("The given Fit Data is invalid:\nThe following Columns contain missing values: ['b']\n"), ), ( - Table.from_dict({"a": [1, 2, "a"], "b": [1, 2, 3], "c": [0, 15, 5]}).to_tabular_dataset("c"), + Table.from_dict({"a": ["a", "b", "c"], "b": [1, 2, 3], "c": [0, 15, 5]}).to_tabular_dataset("c"), re.escape("The given Fit Data is invalid:\nThe following Columns contain non-numerical data: ['a']"), ), ( - Table.from_dict({"a": [1, 2, "a"], "b": [1, 2, None], "c": [0, 15, 5]}).to_tabular_dataset("c"), + Table.from_dict({"a": ["a", "b", "c"], "b": [1, 2, None], "c": [0, 15, 5]}).to_tabular_dataset("c"), re.escape( "The given Fit Data is invalid:\nThe following Columns contain missing values: ['b']\nThe following Columns contain non-numerical data: ['a']", ), ), - ( - Table.from_dict({"a": [1, 2, "a"], "b": [1, "b", None], "c": [0, 15, 5]}).to_tabular_dataset("c"), - re.escape( - "The given Fit Data is invalid:\nThe following Columns contain missing values: ['b']\nThe following Columns contain non-numerical data: ['a', 'b']", - ), - ), ( Table.from_dict({"a": [1, 2, 3], "b": [1, 2, 3], "c": [0, None, 5]}).to_tabular_dataset("c"), - re.escape("The given Fit Data is invalid:\nThe following Columns contain missing values: ['c']\n"), - ), - ( - Table.from_dict({"a": [1, 2, 3], "b": [1, 2, 3], "c": [0, "c", 5]}).to_tabular_dataset("c"), - re.escape("The given Fit Data is invalid:\nThe following Columns contain non-numerical data: ['c']"), - ), - ( - Table.from_dict({"a": [1, 2, 3], "b": [1, 2, 3], "c": [0, "c", None]}).to_tabular_dataset("c"), re.escape( - "The given Fit Data is invalid:\nThe following Columns contain missing values: ['c']\nThe following Columns contain non-numerical data: ['c']", + "The given Fit Data is invalid:\nThe following Columns contain missing values: ['c']\n", ), ), ( - Table.from_dict({"a": [1, 2, "a"], "b": [1, "b", None], "c": [0, "c", None]}).to_tabular_dataset("c"), - re.escape( - "The given Fit Data is invalid:\nThe following Columns contain missing values: ['b', 'c']\nThe following Columns contain non-numerical data: ['a', 'b', 'c']", - ), + Table.from_dict({"a": [1, 2, 3], "b": [1, 2, 3], "c": ["a", "b", "a"]}).to_tabular_dataset("c"), + re.escape("The given Fit Data is invalid:\nThe following Columns contain non-numerical data: ['c']"), ), ], ids=[ "missing value feature", "non-numerical feature", "missing value and non-numerical features", - "mixed missing and non-numerical features", "missing value target", "non-numerical target", - "missing value and non-numerical target", - "mixed missing and non-numerical features and target", ], ) def test_should_catch_invalid_fit_data(self, device: Device, table: TabularDataset, reason: str) -> None: