From 9a90de310728fbdf35c7959269f551033eedd7d5 Mon Sep 17 00:00:00 2001
From: Joppe Geluykens
Date: Wed, 5 Oct 2022 11:43:00 +0000
Subject: [PATCH 1/2] extend test coverage for non-conventional booleans

---
 tests/integration_tests/test_preprocessing.py | 93 +++++++++++++++++++
 .../utils/automl/test_type_inference.py       |  2 +
 2 files changed, 95 insertions(+)

diff --git a/tests/integration_tests/test_preprocessing.py b/tests/integration_tests/test_preprocessing.py
index b0d1f84edb6..eb3ff8a26d2 100644
--- a/tests/integration_tests/test_preprocessing.py
+++ b/tests/integration_tests/test_preprocessing.py
@@ -1,5 +1,7 @@
+import logging
 import os
 import random
+import re
 import string
 
 import numpy as np
@@ -398,3 +400,94 @@ def test_in_memory_dataset_size(backend, tmpdir, ray_cluster_2cpu):
     assert training_dataset.in_memory_size_bytes > 0
     assert validation_dataset.in_memory_size_bytes > 0
     assert test_dataset.in_memory_size_bytes > 0
+
+
+@pytest.mark.parametrize(
+    "binary_as_input, expected_preprocessing",
+    [
+        pytest.param(
+            True,
+            {
+                "missing_value_strategy": "fill_with_false",
+                "fill_value": None,
+                "computed_fill_value": "<=50K",
+                "fallback_true_label": ">50K",
+            },
+            id="binary_as_input",
+        ),
+        pytest.param(
+            False,
+            {
+                "missing_value_strategy": "drop_row",
+                "fill_value": None,
+                "computed_fill_value": None,
+                "fallback_true_label": ">50K",
+            },
+            id="binary_as_output",
+        ),
+    ],
+)
+def test_non_conventional_bool_with_fallback(binary_as_input, expected_preprocessing, tmpdir):
+    # Specify a non-conventional boolean feature with a fallback true label.
+    bin_feature = binary_feature(bool2str=["<=50K", ">50K"], preprocessing={"fallback_true_label": ">50K"})
+
+    # Generate data with the non-conventional boolean feature.
+    if binary_as_input:
+        input_features = [bin_feature]
+        output_features = [number_feature()]
+    else:
+        input_features = [number_feature()]
+        output_features = [bin_feature]
+    config = {"input_features": input_features, "output_features": output_features, TRAINER: {"epochs": 2}}
+
+    data_csv_path = os.path.join(tmpdir, "data.csv")
+    training_data_csv_path = generate_data(input_features, output_features, data_csv_path)
+    df = pd.read_csv(training_data_csv_path)
+
+    # Preprocess the data.
+    ludwig_model = LudwigModel(config)
+    _, _, _, training_set_metadata = ludwig_model.preprocess(dataset=df)
+
+    # Check that true/false labels are set correctly.
+    assert training_set_metadata[bin_feature[NAME]] == {
+        "str2bool": {"<=50K": False, ">50K": True},
+        "bool2str": ["<=50K", ">50K"],
+        "fallback_true_label": ">50K",
+        "preprocessing": expected_preprocessing,
+    }
+
+
+@pytest.mark.parametrize(
+    "binary_as_input", [pytest.param(True, id="binary_as_input"), pytest.param(False, id="binary_as_output")]
+)
+def test_non_conventional_bool_without_fallback_logs_warning(binary_as_input, caplog, tmpdir):
+    # Specify a non-conventional boolean feature without a fallback true label.
+    bin_feature = binary_feature(bool2str=["<=50K", ">50K"], preprocessing={"fallback_true_label": None})
+
+    # Generate data with the non-conventional boolean feature.
+    if binary_as_input:
+        input_features = [bin_feature]
+        output_features = [number_feature()]
+    else:
+        input_features = [number_feature()]
+        output_features = [bin_feature]
+    config = {"input_features": input_features, "output_features": output_features, TRAINER: {"epochs": 2}}
+
+    data_csv_path = os.path.join(tmpdir, "data.csv")
+    training_data_csv_path = generate_data(input_features, output_features, data_csv_path)
+    df = pd.read_csv(training_data_csv_path)
+
+    # Preprocess the data.
+    with caplog.at_level(logging.WARN, logger="ludwig.features.binary_feature"):
+        ludwig_model = LudwigModel(config)
+        ludwig_model.preprocess(dataset=df)
+
+    # Check that a warning is logged.
+    warning_text = re.compile(
+        f"Binary feature {bin_feature[NAME]} has at least 1 unconventional boolean value:"
+        r" Cannot automatically map value '(>50K|<=50K)'"
+        " to a boolean and no `preprocessing.fallback_true_label` specified."
+        " We will now interpret <=50K as 1 and the other values as 0. If this is incorrect, please use the category"
+        " feature type or manually specify the true value with `preprocessing.fallback_true_label`."
+    )
+    assert re.search(warning_text, caplog.text) is not None
diff --git a/tests/ludwig/utils/automl/test_type_inference.py b/tests/ludwig/utils/automl/test_type_inference.py
index 809202dfef6..b0714854cc8 100644
--- a/tests/ludwig/utils/automl/test_type_inference.py
+++ b/tests/ludwig/utils/automl/test_type_inference.py
@@ -26,6 +26,8 @@
         (2, ["0", "1"], 0, 0, 0.0, BINARY),
         # Mostly bool-like values.
         (3, ["0", "1", "True"], 0, 0, 0.0, CATEGORY),
+        # Non-conventional booleans are treated as categories since we cannot infer true/false labels.
+        pytest.param(2, ["<=50K", ">50K"], 0, 0, 0.0, CATEGORY, id="non-conventional-bools"),
         # Finite list of strings.
         (2, ["human", "bot"], 0, 0, 0.0, CATEGORY),
         (10, [generate_string(5) for _ in range(10)], 0, 0, 0.0, CATEGORY),

From de54948d5972fd00e0786224bd92c280da2dd109 Mon Sep 17 00:00:00 2001
From: Joppe Geluykens
Date: Thu, 6 Oct 2022 13:16:39 +0200
Subject: [PATCH 2/2] remove verbose warning check

---
 tests/integration_tests/test_preprocessing.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/tests/integration_tests/test_preprocessing.py b/tests/integration_tests/test_preprocessing.py
index eb3ff8a26d2..22124bed12c 100644
--- a/tests/integration_tests/test_preprocessing.py
+++ b/tests/integration_tests/test_preprocessing.py
@@ -1,7 +1,6 @@
 import logging
 import os
 import random
-import re
 import string
 
 import numpy as np
@@ -483,11 +482,4 @@ def test_non_conventional_bool_without_fallback_logs_warning(binary_as_input, ca
         ludwig_model.preprocess(dataset=df)
 
     # Check that a warning is logged.
-    warning_text = re.compile(
-        f"Binary feature {bin_feature[NAME]} has at least 1 unconventional boolean value:"
-        r" Cannot automatically map value '(>50K|<=50K)'"
-        " to a boolean and no `preprocessing.fallback_true_label` specified."
-        " We will now interpret <=50K as 1 and the other values as 0. If this is incorrect, please use the category"
-        " feature type or manually specify the true value with `preprocessing.fallback_true_label`."
-    )
-    assert re.search(warning_text, caplog.text) is not None
+    assert "unconventional boolean value" in caplog.text