refactor(excel2json-lists): change location of column info extraction (
Nora-Olivia-Ammann authored Aug 16, 2024
1 parent 81b656a commit bd163cf
Showing 8 changed files with 232 additions and 162 deletions.
27 changes: 5 additions & 22 deletions src/dsp_tools/commands/excel2json/new_lists/compliance_checks.py
@@ -11,9 +11,7 @@

from dsp_tools.commands.excel2json.models.input_error import PositionInExcel
from dsp_tools.commands.excel2json.models.input_error import Problem
from dsp_tools.commands.excel2json.new_lists.models.deserialise import ColumnNodes
from dsp_tools.commands.excel2json.new_lists.models.deserialise import Columns
from dsp_tools.commands.excel2json.new_lists.models.deserialise import ColumnsList
from dsp_tools.commands.excel2json.new_lists.models.deserialise import ExcelSheet
from dsp_tools.commands.excel2json.new_lists.models.input_error import CollectedSheetProblems
from dsp_tools.commands.excel2json.new_lists.models.input_error import DuplicateIDProblem
@@ -29,11 +27,9 @@
from dsp_tools.commands.excel2json.new_lists.models.input_error import MultipleListPerSheetProblem
from dsp_tools.commands.excel2json.new_lists.models.input_error import NodesPerRowProblem
from dsp_tools.commands.excel2json.new_lists.models.input_error import SheetProblem
from dsp_tools.commands.excel2json.new_lists.utils import get_all_languages_for_columns
from dsp_tools.commands.excel2json.new_lists.utils import get_columns_of_preferred_lang
from dsp_tools.commands.excel2json.new_lists.utils import get_hierarchy_nums
from dsp_tools.commands.excel2json.new_lists.utils import get_lang_string_from_column_name
from dsp_tools.commands.excel2json.new_lists.utils import get_preferred_language
from dsp_tools.models.custom_warnings import DspToolsUserWarning
from dsp_tools.models.exceptions import InputError

@@ -81,8 +77,7 @@ def _check_for_unique_list_names(sheet_list: list[ExcelSheet]) -> None:
all_problems: list[Problem] = []
sheet_problems: list[SheetProblem] = []
for sheet in sheet_list:
preferred_language = get_preferred_language(sheet.df.columns, r"list")
unique_list_names = list(sheet.df[f"{preferred_language}_list"].unique())
unique_list_names = list(sheet.df[f"{sheet.col_info.preferred_lang}_list"].unique())
if len(unique_list_names) != 1:
sheet_problems.append(MultipleListPerSheetProblem(sheet.excel_name, sheet.sheet_name, unique_list_names))
list_names.extend([ListInformation(sheet.excel_name, sheet.sheet_name, name) for name in unique_list_names])
@@ -227,12 +222,9 @@ def _check_for_missing_translations_all_excels(sheet_list: list[ExcelSheet]) ->


def _check_for_missing_translations_one_sheet(sheet: ExcelSheet) -> MissingTranslationsSheetProblem | None:
col_endings = [str(num) for num in get_hierarchy_nums(sheet.df.columns)]
languages = get_all_languages_for_columns(sheet.df.columns)
all_cols = _compose_all_combinatoric_column_titles(col_endings, languages)
problems = []
for i, row in sheet.df.iterrows():
if problem := _check_missing_translations_one_row(int(str(i)), row, all_cols):
if problem := _check_missing_translations_one_row(int(str(i)), row, sheet.col_info):
problems.append(problem)
if problems:
return MissingTranslationsSheetProblem(sheet.excel_name, sheet.sheet_name, problems)
@@ -245,7 +237,7 @@ def _check_missing_translations_one_row(
missing_translations = []
for col_group in columns.node_cols:
missing_translations.extend(_check_for_missing_translations_one_column_group(row, col_group.columns))
missing_translations.extend(_check_for_missing_translations_one_column_group(row, columns.list_cols.columns))
missing_translations.extend(_check_for_missing_translations_one_column_group(row, columns.list_cols))
if missing_translations:
return MissingNodeTranslationProblem(empty_columns=missing_translations, index_num=row_index)
return None
@@ -258,14 +250,6 @@ def _check_for_missing_translations_one_column_group(row: pd.Series[Any], column
return []


def _compose_all_combinatoric_column_titles(nums: list[str], languages: set[str]) -> Columns:
node_cols = []
for n in nums:
node_cols.append(ColumnNodes(level_num=int(n), columns=[f"{lang}_{n}" for lang in languages]))
list_columns = ColumnsList([f"{lang}_list" for lang in languages])
return Columns(list_cols=list_columns, node_cols=node_cols)


def _check_for_erroneous_entries_all_excels(sheet_list: list[ExcelSheet]) -> None:
problems: list[SheetProblem] = [
p for sheet in sheet_list if (p := _check_for_erroneous_entries_one_list(sheet)) is not None
@@ -277,10 +261,9 @@ def _check_for_erroneous_entries_all_excels(sheet_list: list[ExcelSheet]) -> Non


def _check_for_erroneous_entries_one_list(sheet: ExcelSheet) -> ListSheetContentProblem | None:
preferred_lang = get_preferred_language(sheet.df.columns)
preferred_cols = get_columns_of_preferred_lang(sheet.df.columns, preferred_lang, r"\d+")
preferred_cols = get_columns_of_preferred_lang(sheet.df.columns, sheet.col_info.preferred_lang, r"\d+")
preferred_cols = sorted(preferred_cols)
preferred_cols.insert(0, f"{preferred_lang}_list")
preferred_cols.insert(0, f"{sheet.col_info.preferred_lang}_list")
problems = _check_for_erroneous_node_info_one_df(sheet.df, preferred_cols)
if problems:
list_problems = cast(list[Problem], problems)
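
The change above is representative of the whole file: instead of re-deriving the preferred language from sheet.df.columns inside each check, the checks now read it from the column info attached to the sheet. A minimal, runnable sketch of that access pattern follows; FakeColumns and FakeExcelSheet are simplified stand-ins invented for illustration, not the real dsp_tools dataclasses (those appear further down in this diff).

from __future__ import annotations

from dataclasses import dataclass

import pandas as pd


@dataclass
class FakeColumns:  # stand-in for new_lists.models.deserialise.Columns
    preferred_lang: str
    list_cols: list[str]


@dataclass
class FakeExcelSheet:  # stand-in for new_lists.models.deserialise.ExcelSheet
    excel_name: str
    sheet_name: str
    col_info: FakeColumns
    df: pd.DataFrame


def check_unique_list_names(sheet: FakeExcelSheet) -> bool:
    # After the refactor: read the preferred language from col_info
    # instead of calling get_preferred_language(sheet.df.columns, r"list").
    unique_list_names = sheet.df[f"{sheet.col_info.preferred_lang}_list"].unique()
    return len(unique_list_names) == 1


df = pd.DataFrame({"en_list": ["colours", "colours"], "en_1": ["red", "blue"]})
sheet = FakeExcelSheet("lists.xlsx", "colours", FakeColumns("en", ["en_list"]), df)
print(check_unique_list_names(sheet))  # True
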
69 changes: 32 additions & 37 deletions src/dsp_tools/commands/excel2json/new_lists/make_new_lists.py
@@ -19,10 +19,11 @@
from dsp_tools.commands.excel2json.new_lists.models.serialise import ListNode
from dsp_tools.commands.excel2json.new_lists.models.serialise import ListRoot
from dsp_tools.commands.excel2json.new_lists.utils import get_all_languages_for_columns
from dsp_tools.commands.excel2json.new_lists.utils import get_column_info
from dsp_tools.commands.excel2json.new_lists.utils import get_columns_of_preferred_lang
from dsp_tools.commands.excel2json.new_lists.utils import get_hierarchy_nums
from dsp_tools.commands.excel2json.new_lists.utils import get_lang_string_from_column_name
from dsp_tools.commands.excel2json.new_lists.utils import get_preferred_language
from dsp_tools.commands.excel2json.utils import add_optional_columns
from dsp_tools.commands.excel2json.utils import read_and_clean_all_sheets
from dsp_tools.models.exceptions import InputError

@@ -45,8 +46,8 @@ def new_excel2lists(
Returns:
a tuple consisting of the "lists" section as Python list, and the success status (True if everything went well)
"""
sheet_list = _parse_files(excelfolder)
sheet_list = _prepare_dfs(sheet_list)
df_dict = _parse_files(excelfolder)
sheet_list = _prepare_sheets(df_dict)

finished_lists = _make_serialised_lists(sheet_list)
validate_lists_section_with_schema(lists_section=finished_lists)
@@ -59,55 +60,50 @@
return finished_lists, True


def _parse_files(excelfolder: Path | str) -> list[ExcelSheet]:
def _parse_files(excelfolder: Path | str) -> dict[str, dict[str, pd.DataFrame]]:
file_names = [file for file in Path(excelfolder).glob("*list*.xlsx", case_sensitive=False) if _non_hidden(file)]
all_sheets = []
df_dict = {}
for file in file_names:
all_sheets.extend(
[
ExcelSheet(excel_name=str(file), sheet_name=name, df=df)
for name, df in read_and_clean_all_sheets(file).items()
]
)
return all_sheets
df_dict[str(file)] = read_and_clean_all_sheets(file)
return df_dict


def _non_hidden(path: Path) -> bool:
return not regex.search(r"^(\.|~\$).+", path.name)
def _prepare_sheets(df_dict: dict[str, dict[str, pd.DataFrame]]) -> list[ExcelSheet]:
all_sheets: list[ExcelSheet] = []
for file, sheets in df_dict.items():
all_sheets.extend(_prepare_one_sheet(df, file, sheet_name) for sheet_name, df in sheets.items())
make_all_excel_compliance_checks(all_sheets)
return _construct_ids(all_sheets)


def _prepare_dfs(sheet_list: list[ExcelSheet]) -> list[ExcelSheet]:
sheet_list = _add_id_optional_column_if_not_exists(sheet_list)
make_all_excel_compliance_checks(sheet_list)
return _construct_ids(sheet_list)
def _prepare_one_sheet(df: pd.DataFrame, filename: str, sheet_name: str) -> ExcelSheet:
columns = get_column_info(df.columns)
df = add_optional_columns(df, {"id (optional)"})
return ExcelSheet(excel_name=filename, sheet_name=sheet_name, col_info=columns, df=df)


def _add_id_optional_column_if_not_exists(sheet_list: list[ExcelSheet]) -> list[ExcelSheet]:
all_sheets = []
for sheet in sheet_list:
if "id (optional)" not in sheet.df.columns:
df = sheet.df
df["id (optional)"] = pd.NA
all_sheets.append(ExcelSheet(excel_name=sheet.excel_name, sheet_name=sheet.sheet_name, df=df))
else:
all_sheets.append(sheet)
return all_sheets
def _non_hidden(path: Path) -> bool:
return not regex.search(r"^(\.|~\$).+", path.name)


def _construct_ids(sheet_list: list[ExcelSheet]) -> list[ExcelSheet]:
all_sheets = []
for sheet in sheet_list:
df = _complete_id_one_df(sheet.df, get_preferred_language(sheet.df.columns))
all_sheets.append(ExcelSheet(excel_name=sheet.excel_name, sheet_name=sheet.sheet_name, df=df))
df = _complete_id_one_df(sheet.df, sheet.col_info.preferred_lang)
all_sheets.append(
ExcelSheet(excel_name=sheet.excel_name, col_info=sheet.col_info, sheet_name=sheet.sheet_name, df=df)
)
all_sheets = _resolve_duplicate_ids_all_excels(all_sheets)
return _fill_parent_id_col_all_excels(all_sheets)


def _fill_parent_id_col_all_excels(sheet_list: list[ExcelSheet]) -> list[ExcelSheet]:
all_sheets = []
for sheet in sheet_list:
df = _fill_parent_id_col_one_df(sheet.df, get_preferred_language(sheet.df.columns))
all_sheets.append(ExcelSheet(excel_name=sheet.excel_name, sheet_name=sheet.sheet_name, df=df))
df = _fill_parent_id_col_one_df(sheet.df, sheet.col_info.preferred_lang)
all_sheets.append(
ExcelSheet(excel_name=sheet.excel_name, sheet_name=sheet.sheet_name, col_info=sheet.col_info, df=df)
)
return all_sheets


@@ -139,18 +135,17 @@ def _resolve_duplicate_ids_all_excels(sheet_list: list[ExcelSheet]) -> list[Exce
def _remove_duplicate_ids_in_all_excels(duplicate_ids: list[str], sheet_list: list[ExcelSheet]) -> list[ExcelSheet]:
all_sheets = []
for sheet in sheet_list:
preferred_lang = get_preferred_language(sheet.df.columns)
df = sheet.df
for i, row in df.iterrows():
if row["id"] in duplicate_ids and pd.isna(row["id (optional)"]):
df.at[i, "id"] = _construct_non_duplicate_id_string(df.iloc[int(str(i))], preferred_lang)
all_sheets.append(ExcelSheet(sheet.excel_name, sheet.sheet_name, df))
df.at[i, "id"] = _construct_non_duplicate_id_string(df.iloc[int(str(i))], sheet.col_info.preferred_lang)
all_sheets.append(
ExcelSheet(excel_name=sheet.excel_name, sheet_name=sheet.sheet_name, col_info=sheet.col_info, df=df)
)
return sheet_list


def _complete_id_one_df(df: pd.DataFrame, preferred_language: str) -> pd.DataFrame:
if "id (optional)" not in df.columns:
df["id (optional)"] = pd.NA
df = _create_auto_id_one_df(df, preferred_language)
df["id"] = df["id (optional)"].fillna(df["auto_id"])
df = _resolve_duplicate_ids_keep_custom_change_auto_id_one_df(df, preferred_language)
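
Taken together, the new helpers reshape the parsing step: _parse_files now returns a mapping of file name to sheet name to DataFrame, and _prepare_sheets flattens that mapping into sheet objects whose column info is computed once up front (with the id (optional) column added via add_optional_columns). Below is a rough, self-contained sketch of that flow under those assumptions; SheetStub and the inline language detection are illustrative simplifications, not the real ExcelSheet or get_column_info.

from __future__ import annotations

from dataclasses import dataclass

import pandas as pd


@dataclass
class SheetStub:  # illustrative stand-in for ExcelSheet
    excel_name: str
    sheet_name: str
    preferred_lang: str  # the real class carries a full Columns object
    df: pd.DataFrame


def parse_files_stub() -> dict[str, dict[str, pd.DataFrame]]:
    # Mirrors the new return type: {file name: {sheet name: DataFrame}}.
    df = pd.DataFrame({"en_list": ["colours"], "en_1": ["red"], "de_1": ["rot"]})
    return {"lists.xlsx": {"colours": df}}


def prepare_sheets_stub(df_dict: dict[str, dict[str, pd.DataFrame]]) -> list[SheetStub]:
    sheets = []
    for file, sheets_by_name in df_dict.items():
        for name, df in sheets_by_name.items():
            # Column info is derived once here, instead of in every later check.
            lang = "en" if any(col.startswith("en_") for col in df.columns) else "de"
            sheets.append(SheetStub(file, name, lang, df))
    return sheets


for sheet in prepare_sheets_stub(parse_files_stub()):
    print(sheet.excel_name, sheet.sheet_name, sheet.preferred_lang)
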
src/dsp_tools/commands/excel2json/new_lists/models/deserialise.py
@@ -9,23 +9,20 @@
class ExcelSheet:
excel_name: str
sheet_name: str
col_info: Columns
df: pd.DataFrame


@dataclass
class Columns:
list_cols: ColumnsList
preferred_lang: str
list_cols: list[str]
node_cols: list[ColumnNodes]

def __post_init__(self) -> None:
self.node_cols = sorted(self.node_cols, key=lambda x: x.level_num, reverse=True)


@dataclass
class ColumnsList:
columns: list[str]


@dataclass
class ColumnNodes:
level_num: int
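
For reference, the reshaped dataclasses look like this when assembled: Columns now carries the preferred language and a plain list of list-column names (the ColumnsList wrapper is removed), while __post_init__ still orders the node columns from the deepest hierarchy level upwards. The sketch below reproduces that structure from the diff together with a small usage example; the sample column names are invented.

from __future__ import annotations

from dataclasses import dataclass


@dataclass
class ColumnNodes:
    level_num: int
    columns: list[str]


@dataclass
class Columns:
    preferred_lang: str
    list_cols: list[str]  # previously wrapped in a separate ColumnsList dataclass
    node_cols: list[ColumnNodes]

    def __post_init__(self) -> None:
        # Deepest hierarchy level first, as in the implementation above.
        self.node_cols = sorted(self.node_cols, key=lambda x: x.level_num, reverse=True)


cols = Columns(
    preferred_lang="en",
    list_cols=["en_list", "de_list"],
    node_cols=[ColumnNodes(1, ["en_1", "de_1"]), ColumnNodes(2, ["en_2", "de_2"])],
)
print([node.level_num for node in cols.node_cols])  # [2, 1]
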
28 changes: 26 additions & 2 deletions src/dsp_tools/commands/excel2json/new_lists/utils.py
@@ -1,8 +1,12 @@
from __future__ import annotations

from typing import Any

import pandas as pd
import regex

from dsp_tools.commands.excel2json.new_lists.models.deserialise import ColumnNodes
from dsp_tools.commands.excel2json.new_lists.models.deserialise import Columns
from dsp_tools.models.exceptions import InputError


@@ -20,6 +24,26 @@ def get_columns_of_preferred_lang(
return sorted(col for col in columns if regex.search(rf"^{preferred_language}_{ending}$", col))


def get_column_info(df_cols: pd.Index[Any]) -> Columns:
"""
Constructs and returns all the columns that should be present in the df based on the languages used.
Args:
df_cols: columns of the df
Returns:
Object with column info
"""
hierarchy_nums = get_hierarchy_nums(df_cols)
all_langs = get_all_languages_for_columns(df_cols)
preferred_lang = get_preferred_language_from_columns(df_cols)
node_cols = []
for n in hierarchy_nums:
node_cols.append(ColumnNodes(level_num=int(n), columns=[f"{lang}_{n}" for lang in all_langs]))
list_columns = [f"{lang}_list" for lang in all_langs]
return Columns(preferred_lang=preferred_lang, list_cols=list_columns, node_cols=node_cols)


def get_hierarchy_nums(columns: pd.Index[str]) -> list[int]:
"""Get all the numbers that are used in the column names that contain a language tag."""
return sorted(
@@ -32,9 +56,9 @@ def get_all_languages_for_columns(columns: pd.Index[str], ending: str = r"(\d+|l
return set(res for x in columns if (res := get_lang_string_from_column_name(x, ending)))


def get_preferred_language(columns: pd.Index[str], ending: str = r"(\d+|list)") -> str:
def get_preferred_language_from_columns(columns: pd.Index[str], ending: str = r"(\d+|list)") -> str:
"""Get the language tag of the preferred language."""
match = [res.group(1) for x in columns if (res := regex.search(rf"^(en|de|fr|it|rm)_{ending}+$", x))]
match = {res.group(1) for x in columns if (res := regex.search(rf"^(en|de|fr|it|rm)_{ending}+$", x))}
if "en" in match:
return "en"
elif "de" in match:
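
get_column_info is now the single place where a sheet's column layout is analysed. As a rough illustration of what it derives from a column index, here is a self-contained sketch using the standard re module and plain dicts; the real function returns the Columns dataclass, delegates to get_hierarchy_nums, get_all_languages_for_columns and get_preferred_language_from_columns, and the language priority after en and de is assumed here, since that part of the code is truncated in this view.

from __future__ import annotations

import re

import pandas as pd

LANG_PRIORITY = ("en", "de", "fr", "it", "rm")  # en preferred, then de; remaining order assumed


def column_info_sketch(df_cols: pd.Index) -> dict:
    # Hierarchy levels used by node columns such as en_1, de_2, ...
    nums = sorted({int(m.group(2)) for c in df_cols if (m := re.search(r"^(en|de|fr|it|rm)_(\d+)$", c))})
    # All languages that appear in node or list columns.
    langs = {m.group(1) for c in df_cols if (m := re.search(r"^(en|de|fr|it|rm)_(\d+|list)$", c))}
    preferred = next(lang for lang in LANG_PRIORITY if lang in langs)
    return {
        "preferred_lang": preferred,
        "list_cols": [f"{lang}_list" for lang in sorted(langs)],
        "node_cols": {n: [f"{lang}_{n}" for lang in sorted(langs)] for n in nums},
    }


df = pd.DataFrame(columns=["id (optional)", "en_list", "de_list", "en_1", "de_1", "en_2", "de_2"])
print(column_info_sketch(df.columns))
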