Dump optimized thresholds for buildings as a yaml file #109

Merged 2 commits on Apr 18, 2024
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,7 @@
# main

- Save optimized thresholds as yaml instead of pickle to make it easier to read

### 1.10.2
- Add support for metadata propagation through compound pdal pipelines:
- fix epsg propagation
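
Note for users upgrading from a previous release: thresholds that were serialized with pickle can be converted once to the new yaml format. A minimal sketch, assuming the old pickle file holds a `thresholds` instance (as written by earlier versions) and using illustrative file paths:

```python
import pickle

from lidar_prod.tasks.building_validation import thresholds

# Illustrative paths; adapt them to your results_output_dir.
with open("optimized_thresholds.pickle", "rb") as f:
    old_thresholds: thresholds = pickle.load(f)

# Write the same values in the new, human-readable yaml format.
old_thresholds.dump("optimized_thresholds.yaml")
```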
2 changes: 1 addition & 1 deletion configs/building_validation/optimization/default.yaml
@@ -13,7 +13,7 @@ paths:
group_info_pickle_path: ${.results_output_dir}/group_info.pickle
prepared_las_dir: ${.results_output_dir}/prepared/
updated_las_dir: ${.results_output_dir}/updated/
building_validation_thresholds_pickle: ${.results_output_dir}/optimized_thresholds.pickle # Wher
building_validation_thresholds: ${.results_output_dir}/optimized_thresholds.yaml # Wher

# CLASSIFICATION CODES of a dataset which was inspected
# and labeled post TerraSolid macro
6 changes: 3 additions & 3 deletions docs/source/guides/thresholds_optimization.md
@@ -37,7 +37,7 @@ building_validation.optimization.paths.results_output_dir=[path/to/save/results]

### Evaluation of optimized thresholds on a test set

Once an optimal solution was found, you may want to evaluate the decision process on unseen data to evaluate generalization capability. For that, you will need another test folder of corrected data in the same format as before (a different `input_las_dir`). You need to specify that no optimization is required using the `todo` params. You also need to give the path to the pickled decision thresholds from the previous step, and specify a different `results_output_dir` so that prepared data of test and val test are not pooled together.
Once an optimal solution has been found, you may want to evaluate the decision process on unseen data to assess its generalization capability. For that, you will need another test folder of corrected data in the same format as before (a different `input_las_dir`). You need to specify that no optimization is required via the `todo` parameter. You also need to give the path to the decision thresholds file (a yaml file) from the previous step, and specify a different `results_output_dir` so that the prepared data of the test and validation sets are not pooled together.


```bash
@@ -48,7 +48,7 @@ python lidar_prod/run.py \
building_validation.optimization.todo='prepare+evaluate+update' \
building_validation.optimization.paths.input_las_dir=[path/to/labelled/test/dataset/] \
building_validation.optimization.paths.results_output_dir=[path/to/save/results] \
building_validation.optimization.paths.building_validation_thresholds_pickle=[path/to/optimized_thresholds.pickle]
building_validation.optimization.paths.building_validation_thresholds=[path/to/optimized_thresholds.yaml]
```
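
Because the optimized thresholds are now stored as plain yaml, they can also be inspected or loaded directly from Python, for example to double-check the values used during evaluation. A minimal sketch, assuming the `thresholds.load` helper introduced in this change and an illustrative results path:

```python
from lidar_prod.tasks.building_validation import thresholds

# Illustrative path: matches the default building_validation_thresholds config entry.
optimized = thresholds.load("path/to/save/results/optimized_thresholds.yaml")
print(optimized)  # one attribute per decision threshold, as saved in the yaml file
```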

### Utils
@@ -57,4 +57,4 @@ Debug mode: to run on a single file during development, add a `+building_validat


Reference:
- [Deb et al. (2002) - A fast and elitist multiobjective genetic algorithm\: NSGA-II](https://ieeexplore.ieee.org/document/996017)).
- [Deb et al. (2002) - A fast and elitist multiobjective genetic algorithm\: NSGA-II](https://ieeexplore.ieee.org/document/996017).
12 changes: 12 additions & 0 deletions lidar_prod/tasks/building_validation.py
@@ -9,6 +9,7 @@
import geopandas
import numpy as np
import pdal
import yaml
from tqdm import tqdm

from lidar_prod.tasks.utils import (
@@ -378,3 +379,14 @@ class thresholds:
min_frac_refutation: float
min_entropy_uncertainty: float
min_frac_entropy_uncertain: float

def dump(self, filename: str):
with open(filename, "w") as f:
yaml.safe_dump(self.__dict__, f)

@staticmethod
def load(filename: str):
with open(filename, "r") as f:
data = yaml.safe_load(f)

return thresholds(**data)
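
For reference, the two methods above round-trip through a flat key/value yaml file. A minimal sketch with illustrative values only (the field names come from the `thresholds` class; the numbers are not meaningful defaults):

```python
from lidar_prod.tasks.building_validation import thresholds

th = thresholds(
    min_confidence_confirmation=0.6,  # illustrative values, not tuned defaults
    min_frac_confirmation=0.7,
    min_frac_confirmation_factor_if_bd_uni_overlay=0.9,
    min_uni_db_overlay_frac=0.5,
    min_confidence_refutation=0.6,
    min_frac_refutation=0.7,
    min_entropy_uncertainty=0.8,
    min_frac_entropy_uncertain=0.4,
)
th.dump("optimized_thresholds.yaml")  # writes one "field: value" line per attribute
assert thresholds.load("optimized_thresholds.yaml") == th  # lossless round trip
```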
23 changes: 11 additions & 12 deletions lidar_prod/tasks/building_validation_optimization.py
@@ -185,22 +185,22 @@ def evaluate(self) -> dict:

"""
clusters = self._load_clusters()
self._set_thresholds_from_pickle_if_available()
self._set_thresholds_from_file_if_available()
decisions = np.array([self.bv._make_group_decision(c) for c in clusters])
mts_gt = np.array([c.target for c in clusters])
metrics_dict = self.evaluate_decisions(mts_gt, decisions)
log.info(f"\n Results:\n{self._get_results_logs_str(metrics_dict)}")
return metrics_dict

def _set_thresholds_from_pickle_if_available(self):
def _set_thresholds_from_file_if_available(self):
try:
with open(self.paths.building_validation_thresholds_pickle, "rb") as f:
self.bv.thresholds = pickle.load(f)
self.bv.thresholds = thresholds.load(self.paths.building_validation_thresholds)

except FileNotFoundError:
warnings.warn(
"Using default thresholds from hydra config to perform decisions. "
"You may want to specify different thresholds via a pickled object by specifying "
"building_validation.optimization.paths.building_validation_thresholds_pickle",
"You may want to specify different thresholds via a yaml file by specifying "
"building_validation.optimization.paths.building_validation_thresholds",
UserWarning,
)

@@ -213,7 +213,7 @@ def update(self):

"""
log.info(f"Updated las will be saved in {self.paths.results_output_dir}")
self._set_thresholds_from_pickle_if_available()
self._set_thresholds_from_file_if_available()
for prepared_las_path, target_las_path in tqdm(
zip(self.prepared_las_filepaths, self.out_las_filepaths),
total=len(self.prepared_las_filepaths),
@@ -354,11 +354,10 @@ def _select_best_rules(self, study):
best_rules = thresholds(**best.params)
return best_rules

def _dump_best_rules(self, best_trial_params):
"""Serializes best thresholds."""
with open(self.paths.building_validation_thresholds_pickle, "wb") as f:
pickle.dump(best_trial_params, f)
log.info(f"Pickled best params to {self.paths.building_validation_thresholds_pickle}")
def _dump_best_rules(self, best_trial_params: thresholds):
"""Saves best thresholds to a yaml file."""
best_trial_params.dump(self.paths.building_validation_thresholds)
log.info(f"Saved best params to {self.paths.building_validation_thresholds}")

def _dump_clusters(self, clusters):
"""Serializes the list of cluster-level information objects."""
23 changes: 22 additions & 1 deletion tests/lidar_prod/tasks/test_building_validation.py
@@ -5,7 +5,7 @@
import numpy as np
import pytest

from lidar_prod.tasks.building_validation import BuildingValidator
from lidar_prod.tasks.building_validation import BuildingValidator, thresholds
from lidar_prod.tasks.utils import BDUniConnectionParams, get_las_data_from_las
from tests.conftest import (
check_expected_classification,
@@ -171,3 +171,24 @@ def test_run(hydra_cfg):
dims.candidate_buildings_flag,
],
)


def test_thresholds():
dump_file = str(TMP_DIR / "threshold_dump.yml")

th = thresholds(
min_confidence_confirmation=0.1,
min_frac_confirmation=0.2,
min_frac_confirmation_factor_if_bd_uni_overlay=0.3,
min_uni_db_overlay_frac=0.4,
min_confidence_refutation=0.5,
min_frac_refutation=0.6,
min_entropy_uncertainty=0.7,
min_frac_entropy_uncertain=0.8,
)

th.dump(dump_file)

th1 = th.load(dump_file)

assert th1 == th
194 changes: 194 additions & 0 deletions tests/lidar_prod/tasks/test_building_validation_optimization.py
@@ -0,0 +1,194 @@
import os
import os.path as osp
import shutil
from pathlib import Path

import hydra
import numpy as np
import pytest

from lidar_prod.tasks.building_validation import thresholds
from lidar_prod.tasks.building_validation_optimization import (
BuildingValidationOptimizer,
)
from lidar_prod.tasks.utils import BDUniConnectionParams
from tests.conftest import pdal_read_las_array

"""We test the building validation optimizer against two LAS:

These datasets must have the right classification codes, i.e. the ones defined in
buildings_correction_labels.

WARNING: The large LAS file cannot be versioned by git. If it is absent from the
environment, the corresponding test is expected to fail (xfail).
This enables a shallower run of these tests without the file.

"""

TMP_DIR = Path("tmp/lidar_prod/tasks/building_validation_optimization")


# Small LAS, for which we optimize thresholds and reach perfect validation,
# to quickly check optimization logic.
LAS_SUBSET_FILE = "tests/files/870000_6618000.subset.postIA.corrected.las"
SUBSET_EXPECTED_METRICS = {
"exact": {
"groups_count": 15,
"group_no_buildings": 0.4,
},
"min": {
"p_auto": 1.0,
"recall": 1.0,
"precision": 1.0,
},
}
# Large LAS, on which we evaluate performance, to check that there is no regression in the
# automation/precision/recall of building validation.
LAS_LARGE_FILE = "tests/files/large/V0.5_792000_6272000.las"
LARGE_EXPECTED_METRICS = {
"exact": {
"groups_count": 1493,
"group_no_buildings": 0.149,
"group_building": 0.847,
},
"min": {
"p_auto": 0.94,
"recall": 0.99,
"precision": 0.94,
},
}

# Relative tolerance when comparing metrics to their expected value for large LAS.
# i.e. resulting metrics are >= (1-tolerance) * expected metrics for performance indicators.
RELATIVE_MIN_TOLERANCE_OF_EXPECTED_METRICS = 0.05


def test_BVOptimization_on_subset(hydra_cfg):
out_dir = str(TMP_DIR / "subset")
# Optimization output (thresholds and prepared/updated LAS files) saved to out_dir
hydra_cfg.building_validation.optimization.paths.results_output_dir = out_dir

# We isolate the input file in a subdir, and prepare it for optimization
input_las_dir = osp.join(out_dir, "inputs/")
hydra_cfg.building_validation.optimization.paths.input_las_dir = input_las_dir
os.makedirs(input_las_dir, exist_ok=False)
src_las_copy_path = osp.join(input_las_dir, "copy.las")
shutil.copy(LAS_SUBSET_FILE, src_las_copy_path)

# Check that a full optimization run can pass successfully
bvo: BuildingValidationOptimizer = hydra.utils.instantiate(
hydra_cfg.building_validation.optimization
)
bd_uni_connection_params: BDUniConnectionParams = hydra.utils.instantiate(
hydra_cfg.bd_uni_connection_params
)
bvo.bv.bd_uni_connection_params = bd_uni_connection_params
bvo.run()

# Check that the thresholds are saved to a yaml file successfully
th_yaml = hydra_cfg.building_validation.optimization.paths.building_validation_thresholds
assert os.path.isfile(th_yaml)
assert isinstance(thresholds.load(th_yaml), thresholds)

# Assert that a prepared and an updated file are generated in the temporary dir
# in subfolders.
assert os.path.isfile(osp.join(out_dir, "prepared", osp.basename(src_las_copy_path)))
updated_las_path = osp.join(out_dir, "updated", osp.basename(src_las_copy_path))
assert os.path.isfile(updated_las_path)

# Check the output of the evaluate method. Note that it uses the
# prepared data and the thresholds from the previous run.
metrics_dict = bvo.evaluate()
print(metrics_dict)
# Assert inclusion
assert SUBSET_EXPECTED_METRICS["exact"].items() <= metrics_dict.items()
# Assert that each metric reaches at least its expected minimum value
for k, v in SUBSET_EXPECTED_METRICS["min"].items():
assert v <= metrics_dict[k]
# Update classification dimension and check if the codes are the expected ones.
bvo.bv.use_final_classification_codes = True
bvo.update()
assert os.path.isfile(updated_las_path)
arr, _ = pdal_read_las_array(updated_las_path, hydra_cfg.data_format.epsg)
# Check that we have either 1/2 (unclassified/ground), or one of
# the final classification codes of the module.
final_codes = hydra_cfg.data_format.codes.building.final
expected_codes = {
1,
2,
final_codes.building,
final_codes.not_building,
final_codes.unsure,
}
actual_codes = {*np.unique(arr["Classification"])}
assert actual_codes.issubset(expected_codes)


@pytest.mark.slow()
def test_BVOptimization_on_large_file(hydra_cfg):

if not os.path.isfile(LAS_LARGE_FILE):
pytest.xfail(reason=f"File {LAS_LARGE_FILE} is not present in environment.")

out_dir = str(TMP_DIR / "large_file")

# Optimization output (thresholds and prepared/updated LAS files) saved to out_dir
hydra_cfg.building_validation.optimization.paths.results_output_dir = out_dir

# We isolate the input file in a subdir, and prepare it for optimization
input_las_dir = osp.join(out_dir, "inputs/")
hydra_cfg.building_validation.optimization.paths.input_las_dir = input_las_dir
os.makedirs(input_las_dir, exist_ok=False)
src_las_copy_path = osp.join(input_las_dir, "copy.las")
shutil.copy(LAS_LARGE_FILE, src_las_copy_path)

# Check that a full optimization run can pass successfully
bvo: BuildingValidationOptimizer = hydra.utils.instantiate(
hydra_cfg.building_validation.optimization
)

bd_uni_connection_params: BDUniConnectionParams = hydra.utils.instantiate(
hydra_cfg.bd_uni_connection_params
)
bvo.bv.bd_uni_connection_params = bd_uni_connection_params

bvo.prepare()
metrics_dict = bvo.evaluate()
print(metrics_dict)

exact_expected_val = LARGE_EXPECTED_METRICS["exact"]
for k in exact_expected_val:
assert (
pytest.approx(exact_expected_val[k], RELATIVE_MIN_TOLERANCE_OF_EXPECTED_METRICS)
== metrics_dict[k]
)
min_expected_val = LARGE_EXPECTED_METRICS["min"]
for k in min_expected_val:
assert (
(1 - RELATIVE_MIN_TOLERANCE_OF_EXPECTED_METRICS) * min_expected_val[k]
) <= metrics_dict[k]


# All expected metrics for reference:
"""
groups_count=1493
group_unsure=0.00402
group_no_buildings=0.149
group_building=0.847
p_auto=0.889
p_unsure=0.111
p_refute=0.0924
p_confirm=0.797
a_refute=0.899
a_confirm=0.976
precision=0.98
recall=0.99
Confusion Matrix
[[ 2 1 3]
[ 74 124 25]
[ 89 13 1162]]
Confusion Matrix (normalized)
[[0.333 0.167 0.5 ]
[0.332 0.556 0.112]
[0.07 0.01 0.919]]
"""