diff --git a/src/power_grid_model/_utils.py b/src/power_grid_model/_utils.py
index 23e230d4b..7f79eebe2 100644
--- a/src/power_grid_model/_utils.py
+++ b/src/power_grid_model/_utils.py
@@ -539,9 +539,28 @@ def is_columnar(component_data: ComponentData) -> bool:
     return not isinstance(component_data, np.ndarray)
 
 
-def is_nan_or_equivalent(array):
+def is_nan_or_default(x: np.ndarray) -> np.ndarray:
+    """
+    Check if elements in the array are NaN or equal to the min of its dtype.
+
+    Args:
+        x: A NumPy array to check.
+
+    Returns:
+        A boolean NumPy array where each element is True if the corresponding element in x is NaN
+        or the min of its dtype, and False otherwise.
+    """
+    if x.dtype == np.float64:
+        return np.isnan(x)
+    if x.dtype in (np.int32, np.int8):
+        return x == np.iinfo(x.dtype).min
+    raise TypeError(f"Unsupported data type: {x.dtype}")
+
+
+def is_nan_or_equivalent(array) -> bool:
     """
     Check if the array contains only nan values or equivalent nan values for specific data types.
+    This is the aggregated version of `is_nan_or_default` for the whole array.
 
     Args:
         array: The array to check.
@@ -549,7 +568,7 @@ def is_nan_or_equivalent(array):
     Returns:
         bool: True if the array contains only nan or equivalent nan values, False otherwise.
     """
-    return isinstance(array, np.ndarray) and (
+    return isinstance(array, np.ndarray) and bool(
        (array.dtype == np.float64 and np.isnan(array).all())
        or (array.dtype in (np.int32, np.int8) and np.all(array == np.iinfo(array.dtype).min))
    )
@@ -749,3 +768,19 @@ def get_dataset_type(data: Dataset) -> DatasetType:
         raise ValueError("The dataset type could not be deduced because multiple dataset types match the data.")
 
     return next(iter(candidates))
+
+
+def get_comp_size(comp_data: SingleColumnarData | SingleArray) -> int:
+    """
+    Get the number of elements in the component data of a single dataset.
+
+    Args:
+        comp_data: Columnar or row-based data of a single batch
+
+    Returns:
+        Number of elements in the component
+    """
+    if not is_columnar(comp_data):
+        return len(comp_data)
+    comp_data = cast(SingleColumnarData, comp_data)
+    return len(next(iter(comp_data.values())))
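# ---- Review sketch (not part of the patch) --------------------------------
# A minimal, hypothetical usage sketch of the helpers added above, assuming
# they behave exactly as defined in this hunk; expected values are shown in
# the trailing comments.
import numpy as np

from power_grid_model._utils import get_comp_size, is_nan_or_default, is_nan_or_equivalent

ids = np.array([1, np.iinfo(np.int32).min, 4], dtype=np.int32)
print(is_nan_or_default(ids))      # [False  True False]: element-wise sentinel check
print(is_nan_or_equivalent(ids))   # False: True only if *all* elements are the sentinel
print(get_comp_size({"id": ids}))  # 3: columnar data, length of the first attribute array
# ----------------------------------------------------------------------------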
diff --git a/src/power_grid_model/validation/errors.py b/src/power_grid_model/validation/errors.py
index 58bc84649..48e1fb6d9 100644
--- a/src/power_grid_model/validation/errors.py
+++ b/src/power_grid_model/validation/errors.py
@@ -325,12 +325,14 @@ def __init__(  # pylint: disable=too-many-arguments
         self,
         component: ComponentType,
         field: str,
-        ids: list[int],
-        ref_components: ComponentType | list[ComponentType],
+        ids: Optional[list[int]] = None,
+        ref_components: Optional[ComponentType | list[ComponentType]] = None,
         filters: Optional[dict[str, Any]] = None,
     ):  # pylint: disable=too-many-positional-arguments
-        super().__init__(component=component, field=field, ids=ids)
+        self.ids = ids if ids is not None else []
+        ref_components = ref_components if ref_components is not None else []
+        super().__init__(component=component, field=field, ids=self.ids)
         self.ref_components = [ref_components] if isinstance(ref_components, (str, ComponentType)) else ref_components
         self.filters = filters if filters else None
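# ---- Review sketch (not part of the patch) --------------------------------
# With `ids` and `ref_components` now optional, an InvalidIdError can flag a
# whole component without pointing at specific ids, e.g. when the id column
# is missing from the update data. A minimal sketch of that behaviour:
from power_grid_model import ComponentType
from power_grid_model.validation.errors import InvalidIdError

error = InvalidIdError(component=ComponentType.source, field="id", ids=None)
print(error.ids)             # []: defaults to an empty list
print(error.ref_components)  # []: likewise defaults to an empty list
# ----------------------------------------------------------------------------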
""" - component_data = data[component] + component_data = update_data[component] component_ref_data = ref_data[component] - if not isinstance(component_data, np.ndarray) or not isinstance(component_ref_data, np.ndarray): - raise NotImplementedError() # TODO(mgovers): add support for columnar data - + if component_ref_data["id"].size == 0: + return [InvalidIdError(component=component, field="id", ids=None)] + id_field_is_nan = np.array(is_nan_or_default(component_data["id"])) + # check whether id qualify for optional + if component_data["id"].size == 0 or np.all(id_field_is_nan): + # check if the dimension of the component_data is the same as the component_ref_data + if get_comp_size(component_data) != get_comp_size(component_ref_data): + return [InvalidIdError(component=component, field="id", ids=None)] + return [] # supported optional id + + if np.all(id_field_is_nan) and not np.all(~id_field_is_nan): + return [InvalidIdError(component=component, field="id", ids=None)] + + # normal check: exist and match with input invalid = np.isin(component_data["id"], component_ref_data["id"], invert=True) if invalid.any(): ids = component_data["id"][invalid].flatten().tolist() diff --git a/src/power_grid_model/validation/validation.py b/src/power_grid_model/validation/validation.py index dbbbd7ad6..24ba03fd9 100644 --- a/src/power_grid_model/validation/validation.py +++ b/src/power_grid_model/validation/validation.py @@ -33,6 +33,7 @@ ) from power_grid_model.validation.errors import ( IdNotInDatasetError, + InvalidIdError, MissingValueError, MultiComponentNotUniqueError, ValidationError, @@ -47,7 +48,6 @@ all_greater_or_equal, all_greater_than_or_equal_to_zero, all_greater_than_zero, - all_ids_exist_in_data_set, all_less_than, all_not_two_values_equal, all_not_two_values_zero, @@ -58,6 +58,7 @@ all_valid_enum_values, all_valid_fault_phases, all_valid_ids, + ids_valid_in_update_data_set, none_missing, valid_p_q_sigma, ) @@ -149,13 +150,11 @@ def validate_batch_data( for batch, batch_update_data in enumerate(batch_data): row_update_data = compatibility_convert_row_columnar_dataset(batch_update_data, None, DatasetType.update) assert_valid_data_structure(row_update_data, DatasetType.update) - id_errors: list[ValidationError] = list(validate_ids_exist(row_update_data, input_data_copy)) - batch_errors = input_errors + id_errors - if not id_errors: - merged_data = update_input_data(input_data_copy, row_update_data) - batch_errors += validate_required_values(merged_data, calculation_type, symmetric) - batch_errors += validate_values(merged_data, calculation_type) + batch_errors = input_errors + merged_data = update_input_data(input_data_copy, row_update_data) + batch_errors += validate_required_values(merged_data, calculation_type, symmetric) + batch_errors += validate_values(merged_data, calculation_type) if batch_errors: errors[batch] = batch_errors @@ -216,23 +215,26 @@ def validate_unique_ids_across_components(data: SingleDataset) -> list[MultiComp return all_cross_unique(data, [(component, "id") for component in data]) -def validate_ids_exist(update_data: SingleDataset, input_data: SingleDataset) -> list[IdNotInDatasetError]: +def validate_ids(update_data: SingleDataset, input_data: SingleDataset) -> list[IdNotInDatasetError | InvalidIdError]: """ - Checks if all ids of the components in the update data exist in the input data. This needs to be true, because you - can only update existing components. 
diff --git a/src/power_grid_model/validation/validation.py b/src/power_grid_model/validation/validation.py
index dbbbd7ad6..24ba03fd9 100644
--- a/src/power_grid_model/validation/validation.py
+++ b/src/power_grid_model/validation/validation.py
@@ -33,6 +33,7 @@
 )
 from power_grid_model.validation.errors import (
     IdNotInDatasetError,
+    InvalidIdError,
     MissingValueError,
     MultiComponentNotUniqueError,
     ValidationError,
@@ -47,7 +48,6 @@
     all_greater_or_equal,
     all_greater_than_or_equal_to_zero,
     all_greater_than_zero,
-    all_ids_exist_in_data_set,
     all_less_than,
     all_not_two_values_equal,
     all_not_two_values_zero,
@@ -58,6 +58,7 @@
     all_valid_enum_values,
     all_valid_fault_phases,
     all_valid_ids,
+    ids_valid_in_update_data_set,
     none_missing,
     valid_p_q_sigma,
 )
@@ -149,13 +150,11 @@ def validate_batch_data(
     for batch, batch_update_data in enumerate(batch_data):
         row_update_data = compatibility_convert_row_columnar_dataset(batch_update_data, None, DatasetType.update)
         assert_valid_data_structure(row_update_data, DatasetType.update)
-        id_errors: list[ValidationError] = list(validate_ids_exist(row_update_data, input_data_copy))
-        batch_errors = input_errors + id_errors
-        if not id_errors:
-            merged_data = update_input_data(input_data_copy, row_update_data)
-            batch_errors += validate_required_values(merged_data, calculation_type, symmetric)
-            batch_errors += validate_values(merged_data, calculation_type)
+        batch_errors = input_errors
+        merged_data = update_input_data(input_data_copy, row_update_data)
+        batch_errors += validate_required_values(merged_data, calculation_type, symmetric)
+        batch_errors += validate_values(merged_data, calculation_type)
 
         if batch_errors:
             errors[batch] = batch_errors
@@ -216,23 +215,26 @@ def validate_unique_ids_across_components(data: SingleDataset) -> list[MultiComp
     return all_cross_unique(data, [(component, "id") for component in data])
 
 
-def validate_ids_exist(update_data: SingleDataset, input_data: SingleDataset) -> list[IdNotInDatasetError]:
+def validate_ids(update_data: SingleDataset, input_data: SingleDataset) -> list[IdNotInDatasetError | InvalidIdError]:
     """
-    Checks if all ids of the components in the update data exist in the input data. This needs to be true, because you
-    can only update existing components.
+    Checks if all ids of the components in the update data:
+    - exist and match those in the input data
+    - are not present, in which case they qualify as optional ids
 
     This function should be called for every update dataset in a batch set
 
     Args:
         update_data: A single update dataset
-        input_data: A power-grid-model input dataset
+        input_data: Input dataset
 
     Returns:
-        An empty list if all update data ids exist in the input dataset, or a list of IdNotInDatasetErrors for
-        all update components of which the id does not exist in the input dataset
+        An empty list if all update data ids are valid, or a list of IdNotInDatasetError or InvalidIdError objects
+        for all update components that have invalid ids
     """
-    errors = (all_ids_exist_in_data_set(update_data, input_data, component, "input_data") for component in update_data)
+    errors = (
+        ids_valid_in_update_data_set(update_data, input_data, component, "update_data") for component in update_data
+    )
     return list(chain(*errors))
diff --git a/tests/unit/validation/test_batch_validation.py b/tests/unit/validation/test_batch_validation.py
index 26520dae9..064f9853d 100644
--- a/tests/unit/validation/test_batch_validation.py
+++ b/tests/unit/validation/test_batch_validation.py
@@ -119,6 +119,7 @@ def test_validate_batch_data_input_error(input_data, batch_data):
 def test_validate_batch_data_update_error(input_data, batch_data):
     batch_data["line"]["from_status"] = np.array([[12, 34], [0, -128], [56, 78]])
     errors = validate_batch_data(input_data, batch_data)
-    assert len(errors) == 2
-    assert [NotBooleanError("line", "from_status", [5, 6])] == errors[0]
-    assert [NotBooleanError("line", "from_status", [5, 7])] == errors[2]
+    assert len(errors) == 3
+    assert NotBooleanError("line", "from_status", [5, 6]) == errors[0][0]
+    assert NotBooleanError("line", "from_status", [5, 7]) == errors[1][1]
+    assert NotBooleanError("line", "from_status", [5, 6]) == errors[2][0]
diff --git a/tests/unit/validation/test_validation_functions.py b/tests/unit/validation/test_validation_functions.py
index 646ecf1f8..73c1102d3 100644
--- a/tests/unit/validation/test_validation_functions.py
+++ b/tests/unit/validation/test_validation_functions.py
@@ -9,8 +9,16 @@
 import pytest
 
 from power_grid_model import CalculationType, LoadGenType, MeasuredTerminalType, initialize_array, power_grid_meta_data
+from power_grid_model._utils import compatibility_convert_row_columnar_dataset
 from power_grid_model.core.dataset_definitions import ComponentType, DatasetType
-from power_grid_model.enum import Branch3Side, BranchSide, CalculationType, FaultType, TapChangingStrategy
+from power_grid_model.enum import (
+    Branch3Side,
+    BranchSide,
+    CalculationType,
+    ComponentAttributeFilterOptions,
+    FaultType,
+    TapChangingStrategy,
+)
 from power_grid_model.validation import assert_valid_input_data
 from power_grid_model.validation.errors import (
     IdNotInDatasetError,
@@ -27,7 +35,7 @@
 from power_grid_model.validation.validation import (
     assert_valid_data_structure,
     validate_generic_power_sensor,
-    validate_ids_exist,
+    validate_ids,
     validate_required_values,
     validate_unique_ids_across_components,
     validate_values,
@@ -113,7 +121,7 @@ def test_validate_unique_ids_across_components():
     assert len(unique_id_errors[0].ids) == 4
 
 
-def test_validate_ids_exist():
+def test_validate_ids():
     source = initialize_array("input", "source", 3)
     source["id"] = [1, 2, 3]
 
@@ -135,10 +143,47 @@
     update_data = {"source": source_update, "sym_load": sym_load_update}
 
-    invalid_ids = validate_ids_exist(update_data, input_data)
+    invalid_ids = validate_ids(update_data, input_data)
+
+    assert IdNotInDatasetError("source", [4], "update_data") in invalid_ids
+    assert IdNotInDatasetError("sym_load", [7], "update_data") in invalid_ids
+
+    source_update_no_id = initialize_array("update", "source", 3)
+    source_update_no_id["u_ref"] = [1.0, 2.0, 3.0]
+
+    update_data_col = compatibility_convert_row_columnar_dataset(
+        data={"source": source_update_no_id, "sym_load": sym_load_update},
+        data_filter=ComponentAttributeFilterOptions.relevant,
+        dataset_type=DatasetType.update,
+    )
+    invalid_ids = validate_ids(update_data_col, input_data)
+    assert len(invalid_ids) == 1
+    assert IdNotInDatasetError("sym_load", [7], "update_data") in invalid_ids
+
+    source_update_less_no_id = initialize_array("update", "source", 2)
+    source_update_less_no_id["u_ref"] = [1.0, 2.0]
 
-    assert IdNotInDatasetError("source", [4], "input_data") in invalid_ids
-    assert IdNotInDatasetError("sym_load", [7], "input_data") in invalid_ids
+    update_data_col_less_no_id = compatibility_convert_row_columnar_dataset(
+        data={"source": source_update_less_no_id, "sym_load": sym_load_update},
+        data_filter=ComponentAttributeFilterOptions.relevant,
+        dataset_type=DatasetType.update,
+    )
+    invalid_ids = validate_ids(update_data_col_less_no_id, input_data)
+    assert len(invalid_ids) == 2
+    assert IdNotInDatasetError("sym_load", [7], "update_data") in invalid_ids
+
+    source_update_part_nan_id = initialize_array("update", "source", 3)
+    source_update_part_nan_id["id"] = [1, np.iinfo(np.int32).min, 4]
+    source_update_part_nan_id["u_ref"] = [1.0, 2.0, 3.0]
+
+    update_data_col_part_nan_id = compatibility_convert_row_columnar_dataset(
+        data={"source": source_update_part_nan_id, "sym_load": sym_load_update},
+        data_filter=ComponentAttributeFilterOptions.relevant,
+        dataset_type=DatasetType.update,
+    )
+    invalid_ids = validate_ids(update_data_col_part_nan_id, input_data)
+    assert len(invalid_ids) == 2
+    assert IdNotInDatasetError("sym_load", [7], "update_data") in invalid_ids
 
 
 @pytest.mark.parametrize(