Better handling of whitespace in param names

larsevj · Oct 9, 2024 · 010fe9f
1 parent 9567649
commit 010fe9f
Show file tree

Hide file tree

Showing 2 changed files with 84 additions and 53 deletions.
diff --git a/src/ert/config/design_matrix.py b/src/ert/config/design_matrix.py
@@ -80,18 +80,22 @@ def read_design_matrix(
  """
  Reads out all file content from different files and create dataframes
  """
- param_names = pd.read_excel(
- self.xls_filename,
- sheet_name=self.design_sheet,
- nrows=1,
- header=None,
- dtype=str,
- ).iloc[0]
+ param_names = (
+ pd.read_excel(
+ self.xls_filename,
+ sheet_name=self.design_sheet,
+ nrows=1,
+ header=None,
+ dtype="string",
+ )
+ .iloc[0]
+ .apply(lambda x: x.strip() if isinstance(x, str) else x)
+ )
  if len(param_names) - len(set(param_names)) != 0:
  raise ValueError("Duplicate parameter names found in design sheet")
  design_matrix_df = DesignMatrix._read_excel(
  self.xls_filename, self.design_sheet
- )
+ ).rename(columns=lambda x: str(x).strip())
 
  if "REAL" in design_matrix_df.columns:
  if not is_integer_dtype(design_matrix_df.dtypes["REAL"]) or any(
@@ -148,6 +152,7 @@ def _read_excel(
  sheet_name: str,
  usecols: int | list[int] | None = None,
  header: int | None = 0,
+ dtype: str | None = None,
  ) -> pd.DataFrame:
  """
  Make dataframe from excel file
@@ -160,6 +165,7 @@ def _read_excel(
  sheet_name,
  usecols=usecols,
  header=header,
+ dtype=dtype,
  )
  return dframe.dropna(axis=1, how="all")
 
@@ -171,28 +177,17 @@ def _validate_design_matrix(design_matrix: pd.DataFrame) -> list[str]:
  if design_matrix.empty:
  return []
  errors = []
- try:
- unnamed = design_matrix.loc[
- :, design_matrix.columns.str.contains("^Unnamed")
- ]
- except ValueError as err:
- # We catch because int/floats as column headers
- # in xlsx gets read as int/float and is not valid to index by.
- errors.append(f"Invalid value in design matrix header, error: {err !s}")
- else:
- column_indexes = [int(x.split(":")[1]) for x in unnamed.columns.to_numpy()]
- if len(column_indexes) > 0:
- errors.append(f"Column headers not present in column {column_indexes}")
-
- # Look for initial or trailing whitespace in column headers. This
- # is disallowed as it can create user confusion and has no use-case.
- for col_header in design_matrix:
- if col_header != col_header.strip():
- errors.append(
- (
- f"Column header '{col_header}' contains initial or trailing whitespace."
- )
- )
+ column_indexes_unnamed = [
+ index
+ for index, value in enumerate(
+ design_matrix.columns.str.contains("^Unnamed")
+ )
+ if value
+ ]
+ if len(column_indexes_unnamed) > 0:
+ errors.append(
+ f"Column headers not present in column {column_indexes_unnamed}"
+ )
 
  empties = [
  f"Realization {design_matrix.index[i]}, column {design_matrix.columns[j]}"
@@ -215,7 +210,10 @@ def _read_defaultssheet(
  :raises: ValueError if defaults sheet is non-empty but non-parsable
  """
  default_df = DesignMatrix._read_excel(
- xlsfilename, defaultssheetname, usecols=[0, 1], header=None
+ xlsfilename,
+ defaultssheetname,
+ header=None,
+ dtype="string",
  )
  if default_df.empty:
  return {}
@@ -225,19 +223,11 @@ def _read_defaultssheet(
  f"Row {default_df.index[i]}, column {default_df.columns[j]}"
  for i, j in zip(*np.where(pd.isna(default_df)))
  ]
- if empty_cells > 0:
+ if len(empty_cells) > 0:
  raise ValueError(f"Default sheet contains empty cells {empty_cells}")
- # Look for initial or trailing whitespace in parameter names. This
- # is disallowed as it can create user confusion and has no use-case.
- whitespace_errors = []
- for paramname in default_df.loc[:, 0]:
- if paramname != paramname.strip():
- whitespace_errors.append(
- f"Parameter name '{paramname}' in default values contains "
- "initial or trailing whitespace."
- )
- if whitespace_errors > 0:
- raise ValueError(whitespace_errors)
+ default_df[0] = default_df[0].apply(lambda x: x.strip())
+ if not default_df[0].is_unique:
+ raise ValueError("Default sheet contains duplicate parameter names")
 
  return {row[0]: convert_to_numeric(row[1]) for _, row in default_df.iterrows()}
 

diff --git a/tests/ert/unit_tests/sensitivity_analysis/test_design_matrix.py b/tests/ert/unit_tests/sensitivity_analysis/test_design_matrix.py
@@ -70,21 +70,16 @@ def test_reading_design_matrix_validate_reals(tmp_path, real_column, error_msg):
  "Duplicate parameter names found in design sheet",
  id="duplicate entries",
  ),
+ pytest.param(
+ ["a ", "b", " a"],
+ "Duplicate parameter names found in design sheet",
+ id="duplicate entries with whitespaces",
+ ),
  pytest.param(
  ["a", "b ", ""],
  r"Column headers not present in column \[2\]",
  id="missing entries",
  ),
- pytest.param(
- ["a", "b", 10],
- "Invalid value in design matrix header, error: Cannot mask with non-boolean array containing NA / NaN values",
- id="float entries",
- ),
- pytest.param(
- ["a", "b", " som "],
- r"Column header ' som ' contains initial or trailing whitespace.",
- id="float entries",
- ),
  ],
 )
 def test_reading_design_matrix_validate_headers(tmp_path, column_names, error_msg):
@@ -137,3 +132,49 @@ def test_reading_design_matrix_validate_cells(tmp_path, values, error_msg):
  design_matrix = DesignMatrix(design_path, "DesignSheet01", "DefaultValues")
  with pytest.raises(ValueError, match=error_msg):
  design_matrix.read_design_matrix()
+
+
+@pytest.mark.parametrize(
+ "data, error_msg",
+ [
+ pytest.param(
+ [["one"], ["b"], ["d"]],
+ "Defaults sheet must have at least two columns",
+ id="Too few columns",
+ ),
+ pytest.param(
+ [["one", 1], ["b", ""], ["d", 6]],
+ r"Default sheet contains empty cells \['Row 1, column 1'\]",
+ id="empty cells",
+ ),
+ pytest.param(
+ [[2, 1], ["b", ""], ["d", 6]],
+ r"Default sheet contains empty cells \['Row 1, column 1'\]",
+ id="numerical entries as param names",
+ ),
+ pytest.param(
+ [[" a", 1], ["a ", "some"], ["d", 6]],
+ r"Default sheet contains duplicate parameter names",
+ id="duplicate parameter names",
+ ),
+ ],
+)
+def test_reading_default_sheet_validation(tmp_path, data, error_msg):
+ design_path = tmp_path / "design_matrix.xlsx"
+ design_matrix_df = pd.DataFrame(
+ {
+ "REAL": [0, 1, 2],
+ "a": [1, 2, 3],
+ "b": [0, 2, 0],
+ "c": [3, 1, 3],
+ }
+ )
+ default_sheet_df = pd.DataFrame(data)
+ with pd.ExcelWriter(design_path) as xl_write:
+ design_matrix_df.to_excel(xl_write, index=False, sheet_name="DesignSheet01")
+ default_sheet_df.to_excel(
+ xl_write, index=False, sheet_name="DefaultValues", header=False
+ )
+ design_matrix = DesignMatrix(design_path, "DesignSheet01", "DefaultValues")
+ with pytest.raises(ValueError, match=error_msg):
+ design_matrix.read_design_matrix()