From 9a90de310728fbdf35c7959269f551033eedd7d5 Mon Sep 17 00:00:00 2001
From: Joppe Geluykens <joppe@predibase.com>
Date: Wed, 5 Oct 2022 11:43:00 +0000
Subject: [PATCH 1/2] extend test coverage for non-conventional booleans

---
 tests/integration_tests/test_preprocessing.py | 93 +++++++++++++++++++
 .../utils/automl/test_type_inference.py       |  2 +
 2 files changed, 95 insertions(+)

diff --git a/tests/integration_tests/test_preprocessing.py b/tests/integration_tests/test_preprocessing.py
index b0d1f84edb6..eb3ff8a26d2 100644
--- a/tests/integration_tests/test_preprocessing.py
+++ b/tests/integration_tests/test_preprocessing.py
@@ -1,5 +1,7 @@
+import logging
 import os
 import random
+import re
 import string
 
 import numpy as np
@@ -398,3 +400,94 @@ def test_in_memory_dataset_size(backend, tmpdir, ray_cluster_2cpu):
     assert training_dataset.in_memory_size_bytes > 0
     assert validation_dataset.in_memory_size_bytes > 0
     assert test_dataset.in_memory_size_bytes > 0
+
+
+@pytest.mark.parametrize(
+    "binary_as_input, expected_preprocessing",
+    [
+        pytest.param(
+            True,
+            {
+                "missing_value_strategy": "fill_with_false",
+                "fill_value": None,
+                "computed_fill_value": "<=50K",
+                "fallback_true_label": ">50K",
+            },
+            id="binary_as_input",
+        ),
+        pytest.param(
+            False,
+            {
+                "missing_value_strategy": "drop_row",
+                "fill_value": None,
+                "computed_fill_value": None,
+                "fallback_true_label": ">50K",
+            },
+            id="binary_as_output",
+        ),
+    ],
+)
+def test_non_conventional_bool_with_fallback(binary_as_input, expected_preprocessing, tmpdir):
+    # Binary feature labelled "<=50K" / ">50K", with ">50K" given explicitly as the fallback true label.
+    bin_feature = binary_feature(bool2str=["<=50K", ">50K"], preprocessing={"fallback_true_label": ">50K"})
+
+    # Generate data with the binary feature as either the input or the output, per the parametrization.
+    if binary_as_input:
+        input_features = [bin_feature]
+        output_features = [number_feature()]
+    else:
+        input_features = [number_feature()]
+        output_features = [bin_feature]
+    config = {"input_features": input_features, "output_features": output_features, TRAINER: {"epochs": 2}}
+
+    data_csv_path = os.path.join(tmpdir, "data.csv")
+    training_data_csv_path = generate_data(input_features, output_features, data_csv_path)
+    df = pd.read_csv(training_data_csv_path)
+
+    # Run preprocessing only and keep the training set metadata (fourth returned value).
+    ludwig_model = LudwigModel(config)
+    _, _, _, training_set_metadata = ludwig_model.preprocess(dataset=df)
+
+    # Check the str2bool/bool2str mappings, the fallback true label and the preprocessing parameters.
+    assert training_set_metadata[bin_feature[NAME]] == {
+        "str2bool": {"<=50K": False, ">50K": True},
+        "bool2str": ["<=50K", ">50K"],
+        "fallback_true_label": ">50K",
+        "preprocessing": expected_preprocessing,
+    }
+
+
+@pytest.mark.parametrize(
+    "binary_as_input", [pytest.param(True, id="binary_as_input"), pytest.param(False, id="binary_as_output")]
+)
+def test_non_conventional_bool_without_fallback_logs_warning(binary_as_input, caplog, tmpdir):
+    # Binary feature labelled "<=50K" / ">50K" whose fallback true label is explicitly left unset (None).
+    bin_feature = binary_feature(bool2str=["<=50K", ">50K"], preprocessing={"fallback_true_label": None})
+
+    # Generate data with the binary feature as either the input or the output, per the parametrization.
+    if binary_as_input:
+        input_features = [bin_feature]
+        output_features = [number_feature()]
+    else:
+        input_features = [number_feature()]
+        output_features = [bin_feature]
+    config = {"input_features": input_features, "output_features": output_features, TRAINER: {"epochs": 2}}
+
+    data_csv_path = os.path.join(tmpdir, "data.csv")
+    training_data_csv_path = generate_data(input_features, output_features, data_csv_path)
+    df = pd.read_csv(training_data_csv_path)
+
+    # Preprocess with the "ludwig.features.binary_feature" logger set to WARNING so the warning is captured.
+    with caplog.at_level(logging.WARNING, logger="ludwig.features.binary_feature"):
+        ludwig_model = LudwigModel(config)
+        ludwig_model.preprocess(dataset=df)
+
+    # Check that a warning is logged.
+    warning_text = re.compile(
+        f"Binary feature {bin_feature[NAME]} has at least 1 unconventional boolean value:"
+        r" Cannot automatically map value '(>50K|<=50K)'"
+        " to a boolean and no `preprocessing.fallback_true_label` specified."
+        " We will now interpret <=50K as 1 and the other values as 0. If this is incorrect, please use the category"
+        " feature type or manually specify the true value with `preprocessing.fallback_true_label`."
+    )
+    assert re.search(warning_text, caplog.text) is not None
diff --git a/tests/ludwig/utils/automl/test_type_inference.py b/tests/ludwig/utils/automl/test_type_inference.py
index 809202dfef6..b0714854cc8 100644
--- a/tests/ludwig/utils/automl/test_type_inference.py
+++ b/tests/ludwig/utils/automl/test_type_inference.py
@@ -26,6 +26,8 @@
         (2, ["0", "1"], 0, 0, 0.0, BINARY),
         # Mostly bool-like values.
         (3, ["0", "1", "True"], 0, 0, 0.0, CATEGORY),
+        # Non-conventional booleans are treated as categories since we cannot infer true/false labels.
+        pytest.param(2, ["<=50K", ">50K"], 0, 0, 0.0, CATEGORY, id="non-conventional-bools"),
         # Finite list of strings.
         (2, ["human", "bot"], 0, 0, 0.0, CATEGORY),
         (10, [generate_string(5) for _ in range(10)], 0, 0, 0.0, CATEGORY),

From de54948d5972fd00e0786224bd92c280da2dd109 Mon Sep 17 00:00:00 2001
From: Joppe Geluykens <joppe@predibase.com>
Date: Thu, 6 Oct 2022 13:16:39 +0200
Subject: [PATCH 2/2] remove verbose warning check

---
 tests/integration_tests/test_preprocessing.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/tests/integration_tests/test_preprocessing.py b/tests/integration_tests/test_preprocessing.py
index eb3ff8a26d2..22124bed12c 100644
--- a/tests/integration_tests/test_preprocessing.py
+++ b/tests/integration_tests/test_preprocessing.py
@@ -1,7 +1,6 @@
 import logging
 import os
 import random
-import re
 import string
 
 import numpy as np
@@ -483,11 +482,4 @@ def test_non_conventional_bool_without_fallback_logs_warning(binary_as_input, ca
         ludwig_model.preprocess(dataset=df)
 
     # Check that a warning is logged.
-    warning_text = re.compile(
-        f"Binary feature {bin_feature[NAME]} has at least 1 unconventional boolean value:"
-        r" Cannot automatically map value '(>50K|<=50K)'"
-        " to a boolean and no `preprocessing.fallback_true_label` specified."
-        " We will now interpret <=50K as 1 and the other values as 0. If this is incorrect, please use the category"
-        " feature type or manually specify the true value with `preprocessing.fallback_true_label`."
-    )
-    assert re.search(warning_text, caplog.text) is not None
+    assert "unconventional boolean value" in caplog.text