Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature / Update Python validation to support optional id in update data #793

Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 36 additions & 2 deletions src/power_grid_model/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,17 +539,36 @@ def is_columnar(component_data: ComponentData) -> bool:
return not isinstance(component_data, np.ndarray)


def is_nan_or_equivalent(array):
def is_nan_or_default(x: np.ndarray) -> np.ndarray:
    """
    Check element-wise whether the values in the array are NaN or equal to the min of its dtype.

    Args:
        x: A NumPy array to check. Only float64, int32 and int8 arrays are supported.

    Returns:
        A boolean NumPy array where each element is True if the corresponding element in x is NaN
        (float64) or the minimum value of its dtype (int32, int8), and False otherwise.

    Raises:
        TypeError: If the dtype of x is not float64, int32 or int8.
    """
    if x.dtype == np.float64:
        return np.isnan(x)
    if x.dtype in (np.int32, np.int8):
        # for the integer dtypes, the smallest representable value is the one that is matched
        return x == np.iinfo(x.dtype).min
    raise TypeError(f"Unsupported data type: {x.dtype}")


def is_nan_or_equivalent(array) -> bool:
    """
    Check if the array contains only nan values or equivalent nan values for specific data types.
    This is the aggregated version of `is_nan_or_default` for the whole array.

    Args:
        array: The array to check.

    Returns:
        bool: True if the array contains only nan or equivalent nan values, False otherwise.
        Anything that is not a NumPy array, or that has a dtype other than float64, int32 or int8,
        yields False.
    """
    if not isinstance(array, np.ndarray):
        return False
    if array.dtype == np.float64:
        # bool(...) turns the NumPy scalar into a plain Python bool
        return bool(np.isnan(array).all())
    if array.dtype in (np.int32, np.int8):
        # for the integer dtypes, the minimum value of the dtype is the nan-equivalent
        return bool(np.all(array == np.iinfo(array.dtype).min))
    return False
Expand Down Expand Up @@ -749,3 +768,18 @@ def get_dataset_type(data: Dataset) -> DatasetType:
raise ValueError("The dataset type could not be deduced because multiple dataset types match the data.")

return next(iter(candidates))


def get_comp_batch_size(comp_data: dict) -> int:
    """
    Get the batch size of the component update data.

    Args:
        comp_data: The component data. It is treated as columnar when `is_columnar` says so, in which
            case it must be a dictionary whose values are sized containers; otherwise it is treated
            as row-based data and measured directly.

    Returns:
        The length of the first value in the dictionary if the data is columnar (0 if the dictionary
        has no entries), otherwise the length of the data itself.
    """
    if not is_columnar(comp_data):
        return len(comp_data)
    # An empty columnar dictionary has nothing to measure: report 0 rather than letting
    # StopIteration escape from next().
    first_column = next(iter(comp_data.values()), None)
    return 0 if first_column is None else len(first_column)
8 changes: 5 additions & 3 deletions src/power_grid_model/validation/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,12 +325,14 @@ def __init__( # pylint: disable=too-many-arguments
self,
component: ComponentType,
field: str,
ids: list[int],
ref_components: ComponentType | list[ComponentType],
ids: Optional[list[int]] = None,
Jerry-Jinfeng-Guo marked this conversation as resolved.
Show resolved Hide resolved
ref_components: Optional[ComponentType | list[ComponentType]] = None,
filters: Optional[dict[str, Any]] = None,
):
# pylint: disable=too-many-positional-arguments
super().__init__(component=component, field=field, ids=ids)
self.ids = ids if ids is not None else []
ref_components = ref_components if ref_components is not None else []
super().__init__(component=component, field=field, ids=self.ids)
self.ref_components = [ref_components] if isinstance(ref_components, (str, ComponentType)) else ref_components
self.filters = filters if filters else None

Expand Down
34 changes: 24 additions & 10 deletions src/power_grid_model/validation/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
import numpy as np

from power_grid_model import ComponentType
from power_grid_model._utils import get_comp_batch_size, is_nan_or_default
from power_grid_model.data_types import SingleDataset
from power_grid_model.enum import FaultPhase, FaultType, WindingType
from power_grid_model.validation.errors import (
Expand Down Expand Up @@ -678,26 +679,39 @@ def all_not_two_values_equal(
return []


def all_ids_exist_in_data_set(
data: SingleDataset, ref_data: SingleDataset, component: ComponentType, ref_name: str
) -> list[IdNotInDatasetError]:
def ids_valid_in_update_data_set(
update_data: SingleDataset, ref_data: SingleDataset, component: ComponentType, ref_name: str
) -> list[IdNotInDatasetError | InvalidIdError]:
"""
Check that for all records of a particular type of component, the ids exist in the reference data set.
Check, for all records of a particular type of component, that the ids either:
- exist and match those in the reference data set, or
- are not present but qualify as optional ids

Args:
data: The (update) data set for all components
update_data: The update data set for all components
ref_data: The reference (input) data set for all components
component: The component of interest
ref_name: The name of the reference data set, e.g. 'input_data'
ref_name: The name of the reference data set, e.g. 'update_data'
Returns:
A list containing zero or one error: either an InvalidIdError (reported without individual ids) or an
IdNotInDatasetError listing all ids of the objects in the data set which do not exist in the reference data set.
"""
component_data = data[component]
component_data = update_data[component]
component_ref_data = ref_data[component]
if not isinstance(component_data, np.ndarray) or not isinstance(component_ref_data, np.ndarray):
raise NotImplementedError() # TODO(mgovers): add support for columnar data

if component_ref_data["id"].size == 0:
return [InvalidIdError(component=component, field="id", ids=None)]
id_field_is_nan = np.array(is_nan_or_default(component_data["id"]))
# check whether the ids qualify as optional
if component_data["id"].size == 0 or np.all(id_field_is_nan):
# check if the dimension of the component_data is the same as the component_ref_data
if get_comp_batch_size(component_data) != get_comp_batch_size(component_ref_data):
return [InvalidIdError(component=component, field="id", ids=None)]
return [] # supported optional id

if np.all(id_field_is_nan) and not np.all(~id_field_is_nan):
return [InvalidIdError(component=component, field="id", ids=None)]

# normal check: exist and match with input
invalid = np.isin(component_data["id"], component_ref_data["id"], invert=True)
if invalid.any():
ids = component_data["id"][invalid].flatten().tolist()
Expand Down
30 changes: 16 additions & 14 deletions src/power_grid_model/validation/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
)
from power_grid_model.validation.errors import (
IdNotInDatasetError,
InvalidIdError,
MissingValueError,
MultiComponentNotUniqueError,
ValidationError,
Expand All @@ -47,7 +48,6 @@
all_greater_or_equal,
all_greater_than_or_equal_to_zero,
all_greater_than_zero,
all_ids_exist_in_data_set,
all_less_than,
all_not_two_values_equal,
all_not_two_values_zero,
Expand All @@ -58,6 +58,7 @@
all_valid_enum_values,
all_valid_fault_phases,
all_valid_ids,
ids_valid_in_update_data_set,
none_missing,
valid_p_q_sigma,
)
Expand Down Expand Up @@ -149,13 +150,11 @@ def validate_batch_data(
for batch, batch_update_data in enumerate(batch_data):
row_update_data = compatibility_convert_row_columnar_dataset(batch_update_data, None, DatasetType.update)
assert_valid_data_structure(row_update_data, DatasetType.update)
id_errors: list[ValidationError] = list(validate_ids_exist(row_update_data, input_data_copy))

batch_errors = input_errors + id_errors
if not id_errors:
merged_data = update_input_data(input_data_copy, row_update_data)
batch_errors += validate_required_values(merged_data, calculation_type, symmetric)
batch_errors += validate_values(merged_data, calculation_type)
batch_errors = input_errors
merged_data = update_input_data(input_data_copy, row_update_data)
batch_errors += validate_required_values(merged_data, calculation_type, symmetric)
batch_errors += validate_values(merged_data, calculation_type)

if batch_errors:
errors[batch] = batch_errors
Expand Down Expand Up @@ -216,23 +215,26 @@ def validate_unique_ids_across_components(data: SingleDataset) -> list[MultiComp
return all_cross_unique(data, [(component, "id") for component in data])


def validate_ids(update_data: SingleDataset, input_data: SingleDataset) -> list[IdNotInDatasetError | InvalidIdError]:
    """
    Checks if all ids of the components in the update data:
     - exist and match those in the input data
     - are not present but qualify as optional ids

    This function should be called for every update dataset in a batch set

    Args:
        update_data: A single update dataset
        input_data: Input dataset

    Returns:
        An empty list if all update data ids are valid, or a list of IdNotInDatasetErrors or InvalidIdErrors for
        all update components that have invalid ids

    """
    # NOTE(review): the label handed over as ref_name is "update_data", while the ids are looked up in
    # input_data - confirm this is the wording intended for the resulting error.
    errors = (
        ids_valid_in_update_data_set(update_data, input_data, component, "update_data") for component in update_data
    )
    # flatten the per-component error lists into a single list
    return list(chain.from_iterable(errors))


Expand Down
7 changes: 4 additions & 3 deletions tests/unit/validation/test_batch_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ def test_validate_batch_data_input_error(input_data, batch_data):
def test_validate_batch_data_update_error(input_data, batch_data):
batch_data["line"]["from_status"] = np.array([[12, 34], [0, -128], [56, 78]])
errors = validate_batch_data(input_data, batch_data)
assert len(errors) == 2
assert [NotBooleanError("line", "from_status", [5, 6])] == errors[0]
assert [NotBooleanError("line", "from_status", [5, 7])] == errors[2]
assert len(errors) == 3
assert NotBooleanError("line", "from_status", [5, 6]) == errors[0][0]
assert NotBooleanError("line", "from_status", [5, 7]) == errors[1][1]
assert NotBooleanError("line", "from_status", [5, 6]) == errors[2][0]
57 changes: 51 additions & 6 deletions tests/unit/validation/test_validation_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,16 @@
import pytest

from power_grid_model import CalculationType, LoadGenType, MeasuredTerminalType, initialize_array, power_grid_meta_data
from power_grid_model._utils import compatibility_convert_row_columnar_dataset
from power_grid_model.core.dataset_definitions import ComponentType, DatasetType
from power_grid_model.enum import Branch3Side, BranchSide, CalculationType, FaultType, TapChangingStrategy
from power_grid_model.enum import (
Branch3Side,
BranchSide,
CalculationType,
ComponentAttributeFilterOptions,
FaultType,
TapChangingStrategy,
)
from power_grid_model.validation import assert_valid_input_data
from power_grid_model.validation.errors import (
IdNotInDatasetError,
Expand All @@ -27,7 +35,7 @@
from power_grid_model.validation.validation import (
assert_valid_data_structure,
validate_generic_power_sensor,
validate_ids_exist,
validate_ids,
validate_required_values,
validate_unique_ids_across_components,
validate_values,
Expand Down Expand Up @@ -113,7 +121,7 @@ def test_validate_unique_ids_across_components():
assert len(unique_id_errors[0].ids) == 4


def test_validate_ids_exist():
def test_validate_ids():
Jerry-Jinfeng-Guo marked this conversation as resolved.
Show resolved Hide resolved
source = initialize_array("input", "source", 3)
source["id"] = [1, 2, 3]

Expand All @@ -135,10 +143,47 @@ def test_validate_ids_exist():

update_data = {"source": source_update, "sym_load": sym_load_update}

invalid_ids = validate_ids_exist(update_data, input_data)
invalid_ids = validate_ids(update_data, input_data)

assert IdNotInDatasetError("source", [4], "update_data") in invalid_ids
assert IdNotInDatasetError("sym_load", [7], "update_data") in invalid_ids

source_update_no_id = initialize_array("update", "source", 3)
source_update_no_id["u_ref"] = [1.0, 2.0, 3.0]

update_data_col = compatibility_convert_row_columnar_dataset(
data={"source": source_update_no_id, "sym_load": sym_load_update},
data_filter=ComponentAttributeFilterOptions.relevant,
dataset_type=DatasetType.update,
)
invalid_ids = validate_ids(update_data_col, input_data)
assert len(invalid_ids) == 1
assert IdNotInDatasetError("sym_load", [7], "update_data") in invalid_ids

source_update_less_no_id = initialize_array("update", "source", 2)
source_update_less_no_id["u_ref"] = [1.0, 2.0]

assert IdNotInDatasetError("source", [4], "input_data") in invalid_ids
assert IdNotInDatasetError("sym_load", [7], "input_data") in invalid_ids
update_data_col_less_no_id = compatibility_convert_row_columnar_dataset(
data={"source": source_update_less_no_id, "sym_load": sym_load_update},
data_filter=ComponentAttributeFilterOptions.relevant,
dataset_type=DatasetType.update,
)
invalid_ids = validate_ids(update_data_col_less_no_id, input_data)
assert len(invalid_ids) == 2
assert IdNotInDatasetError("sym_load", [7], "update_data") in invalid_ids
figueroa1395 marked this conversation as resolved.
Show resolved Hide resolved

source_update_part_nan_id = initialize_array("update", "source", 3)
source_update_part_nan_id["id"] = [1, np.iinfo(np.int32).min, 4]
source_update_part_nan_id["u_ref"] = [1.0, 2.0, 3.0]

update_data_col_part_nan_id = compatibility_convert_row_columnar_dataset(
data={"source": source_update_part_nan_id, "sym_load": sym_load_update},
data_filter=ComponentAttributeFilterOptions.relevant,
dataset_type=DatasetType.update,
)
invalid_ids = validate_ids(update_data_col_part_nan_id, input_data)
assert len(invalid_ids) == 2
assert IdNotInDatasetError("sym_load", [7], "update_data") in invalid_ids
figueroa1395 marked this conversation as resolved.
Show resolved Hide resolved


@pytest.mark.parametrize(
Expand Down
Loading