Skip to content

Commit

Permalink
Better handling of whitespace in param names
Browse files Browse the repository at this point in the history
  • Loading branch information
larsevj committed Oct 9, 2024
1 parent 9567649 commit 010fe9f
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 53 deletions.
76 changes: 33 additions & 43 deletions src/ert/config/design_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,18 +80,22 @@ def read_design_matrix(
"""
Reads out all file content from different files and create dataframes
"""
param_names = pd.read_excel(
self.xls_filename,
sheet_name=self.design_sheet,
nrows=1,
header=None,
dtype=str,
).iloc[0]
param_names = (
pd.read_excel(
self.xls_filename,
sheet_name=self.design_sheet,
nrows=1,
header=None,
dtype="string",
)
.iloc[0]
.apply(lambda x: x.strip() if isinstance(x, str) else x)
)
if len(param_names) - len(set(param_names)) != 0:
raise ValueError("Duplicate parameter names found in design sheet")
design_matrix_df = DesignMatrix._read_excel(
self.xls_filename, self.design_sheet
)
).rename(columns=lambda x: str(x).strip())

if "REAL" in design_matrix_df.columns:
if not is_integer_dtype(design_matrix_df.dtypes["REAL"]) or any(
Expand Down Expand Up @@ -148,6 +152,7 @@ def _read_excel(
sheet_name: str,
usecols: int | list[int] | None = None,
header: int | None = 0,
dtype: str | None = None,
) -> pd.DataFrame:
"""
Make dataframe from excel file
Expand All @@ -160,6 +165,7 @@ def _read_excel(
sheet_name,
usecols=usecols,
header=header,
dtype=dtype,
)
return dframe.dropna(axis=1, how="all")

Expand All @@ -171,28 +177,17 @@ def _validate_design_matrix(design_matrix: pd.DataFrame) -> list[str]:
if design_matrix.empty:
return []
errors = []
try:
unnamed = design_matrix.loc[
:, design_matrix.columns.str.contains("^Unnamed")
]
except ValueError as err:
# We catch because int/floats as column headers
# in xlsx gets read as int/float and is not valid to index by.
errors.append(f"Invalid value in design matrix header, error: {err !s}")
else:
column_indexes = [int(x.split(":")[1]) for x in unnamed.columns.to_numpy()]
if len(column_indexes) > 0:
errors.append(f"Column headers not present in column {column_indexes}")

# Look for initial or trailing whitespace in column headers. This
# is disallowed as it can create user confusion and has no use-case.
for col_header in design_matrix:
if col_header != col_header.strip():
errors.append(
(
f"Column header '{col_header}' contains initial or trailing whitespace."
)
)
column_indexes_unnamed = [
index
for index, value in enumerate(
design_matrix.columns.str.contains("^Unnamed")
)
if value
]
if len(column_indexes_unnamed) > 0:
errors.append(
f"Column headers not present in column {column_indexes_unnamed}"
)

empties = [
f"Realization {design_matrix.index[i]}, column {design_matrix.columns[j]}"
Expand All @@ -215,7 +210,10 @@ def _read_defaultssheet(
:raises: ValueError if defaults sheet is non-empty but non-parsable
"""
default_df = DesignMatrix._read_excel(
xlsfilename, defaultssheetname, usecols=[0, 1], header=None
xlsfilename,
defaultssheetname,
header=None,
dtype="string",
)
if default_df.empty:
return {}
Expand All @@ -225,19 +223,11 @@ def _read_defaultssheet(
f"Row {default_df.index[i]}, column {default_df.columns[j]}"
for i, j in zip(*np.where(pd.isna(default_df)))
]
if empty_cells > 0:
if len(empty_cells) > 0:
raise ValueError(f"Default sheet contains empty cells {empty_cells}")
# Look for initial or trailing whitespace in parameter names. This
# is disallowed as it can create user confusion and has no use-case.
whitespace_errors = []
for paramname in default_df.loc[:, 0]:
if paramname != paramname.strip():
whitespace_errors.append(
f"Parameter name '{paramname}' in default values contains "
"initial or trailing whitespace."
)
if whitespace_errors > 0:
raise ValueError(whitespace_errors)
default_df[0] = default_df[0].apply(lambda x: x.strip())
if not default_df[0].is_unique:
raise ValueError("Default sheet contains duplicate parameter names")

return {row[0]: convert_to_numeric(row[1]) for _, row in default_df.iterrows()}

Expand Down
61 changes: 51 additions & 10 deletions tests/ert/unit_tests/sensitivity_analysis/test_design_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,21 +70,16 @@ def test_reading_design_matrix_validate_reals(tmp_path, real_column, error_msg):
"Duplicate parameter names found in design sheet",
id="duplicate entries",
),
pytest.param(
["a ", "b", " a"],
"Duplicate parameter names found in design sheet",
id="duplicate entries with whitespaces",
),
pytest.param(
["a", "b ", ""],
r"Column headers not present in column \[2\]",
id="missing entries",
),
pytest.param(
["a", "b", 10],
"Invalid value in design matrix header, error: Cannot mask with non-boolean array containing NA / NaN values",
id="float entries",
),
pytest.param(
["a", "b", " som "],
r"Column header ' som ' contains initial or trailing whitespace.",
id="float entries",
),
],
)
def test_reading_design_matrix_validate_headers(tmp_path, column_names, error_msg):
Expand Down Expand Up @@ -137,3 +132,49 @@ def test_reading_design_matrix_validate_cells(tmp_path, values, error_msg):
design_matrix = DesignMatrix(design_path, "DesignSheet01", "DefaultValues")
with pytest.raises(ValueError, match=error_msg):
design_matrix.read_design_matrix()


@pytest.mark.parametrize(
    "data, error_msg",
    [
        (
            [["one"], ["b"], ["d"]],
            "Defaults sheet must have at least two columns",
        ),
        (
            [["one", 1], ["b", ""], ["d", 6]],
            r"Default sheet contains empty cells \['Row 1, column 1'\]",
        ),
        (
            [[2, 1], ["b", ""], ["d", 6]],
            r"Default sheet contains empty cells \['Row 1, column 1'\]",
        ),
        (
            [[" a", 1], ["a ", "some"], ["d", 6]],
            r"Default sheet contains duplicate parameter names",
        ),
    ],
    ids=[
        "Too few columns",
        "empty cells",
        "numerical entries as param names",
        "duplicate parameter names",
    ],
)
def test_reading_default_sheet_validation(tmp_path, data, error_msg):
    """Check that a malformed defaults sheet makes reading raise ValueError.

    A workbook is written to ``tmp_path`` with a fixed design sheet
    ("DesignSheet01") and a header-less defaults sheet ("DefaultValues")
    built from the parametrized ``data`` rows. Reading the design matrix
    must then raise a ``ValueError`` whose message matches ``error_msg``
    (``pytest.raises`` treats ``match`` as a regular expression).
    """
    design_sheet, defaults_sheet = "DesignSheet01", "DefaultValues"
    workbook = tmp_path / "design_matrix.xlsx"

    design_frame = pd.DataFrame(
        {
            "REAL": [0, 1, 2],
            "a": [1, 2, 3],
            "b": [0, 2, 0],
            "c": [3, 1, 3],
        }
    )
    defaults_frame = pd.DataFrame(data)

    # Both sheets go into the same workbook; the design sheet is written first.
    with pd.ExcelWriter(workbook) as writer:
        design_frame.to_excel(writer, sheet_name=design_sheet, index=False)
        defaults_frame.to_excel(
            writer, sheet_name=defaults_sheet, index=False, header=False
        )

    # Construct outside the raises-context so that only the read step is
    # allowed to produce the expected error.
    matrix = DesignMatrix(workbook, design_sheet, defaults_sheet)
    with pytest.raises(ValueError, match=error_msg):
        matrix.read_design_matrix()

0 comments on commit 010fe9f

Please sign in to comment.