refactor(excel2json-lists): change location of column info extraction (
Nora-Olivia-Ammann authored Aug 16, 2024
1 parent 81b656a commit bd163cf
Showing 8 changed files with 232 additions and 162 deletions.
27 changes: 5 additions & 22 deletions src/dsp_tools/commands/excel2json/new_lists/compliance_checks.py
@@ -11,9 +11,7 @@

from dsp_tools.commands.excel2json.models.input_error import PositionInExcel
from dsp_tools.commands.excel2json.models.input_error import Problem
from dsp_tools.commands.excel2json.new_lists.models.deserialise import ColumnNodes
from dsp_tools.commands.excel2json.new_lists.models.deserialise import Columns
from dsp_tools.commands.excel2json.new_lists.models.deserialise import ColumnsList
from dsp_tools.commands.excel2json.new_lists.models.deserialise import ExcelSheet
from dsp_tools.commands.excel2json.new_lists.models.input_error import CollectedSheetProblems
from dsp_tools.commands.excel2json.new_lists.models.input_error import DuplicateIDProblem
@@ -29,11 +27,9 @@
from dsp_tools.commands.excel2json.new_lists.models.input_error import MultipleListPerSheetProblem
from dsp_tools.commands.excel2json.new_lists.models.input_error import NodesPerRowProblem
from dsp_tools.commands.excel2json.new_lists.models.input_error import SheetProblem
from dsp_tools.commands.excel2json.new_lists.utils import get_all_languages_for_columns
from dsp_tools.commands.excel2json.new_lists.utils import get_columns_of_preferred_lang
from dsp_tools.commands.excel2json.new_lists.utils import get_hierarchy_nums
from dsp_tools.commands.excel2json.new_lists.utils import get_lang_string_from_column_name
from dsp_tools.commands.excel2json.new_lists.utils import get_preferred_language
from dsp_tools.models.custom_warnings import DspToolsUserWarning
from dsp_tools.models.exceptions import InputError

@@ -81,8 +77,7 @@ def _check_for_unique_list_names(sheet_list: list[ExcelSheet]) -> None:
all_problems: list[Problem] = []
sheet_problems: list[SheetProblem] = []
for sheet in sheet_list:
preferred_language = get_preferred_language(sheet.df.columns, r"list")
unique_list_names = list(sheet.df[f"{preferred_language}_list"].unique())
unique_list_names = list(sheet.df[f"{sheet.col_info.preferred_lang}_list"].unique())
if len(unique_list_names) != 1:
sheet_problems.append(MultipleListPerSheetProblem(sheet.excel_name, sheet.sheet_name, unique_list_names))
list_names.extend([ListInformation(sheet.excel_name, sheet.sheet_name, name) for name in unique_list_names])
@@ -227,12 +222,9 @@ def _check_for_missing_translations_all_excels(sheet_list: list[ExcelSheet]) ->


def _check_for_missing_translations_one_sheet(sheet: ExcelSheet) -> MissingTranslationsSheetProblem | None:
col_endings = [str(num) for num in get_hierarchy_nums(sheet.df.columns)]
languages = get_all_languages_for_columns(sheet.df.columns)
all_cols = _compose_all_combinatoric_column_titles(col_endings, languages)
problems = []
for i, row in sheet.df.iterrows():
if problem := _check_missing_translations_one_row(int(str(i)), row, all_cols):
if problem := _check_missing_translations_one_row(int(str(i)), row, sheet.col_info):
problems.append(problem)
if problems:
return MissingTranslationsSheetProblem(sheet.excel_name, sheet.sheet_name, problems)
@@ -245,7 +237,7 @@ def _check_missing_translations_one_row(
missing_translations = []
for col_group in columns.node_cols:
missing_translations.extend(_check_for_missing_translations_one_column_group(row, col_group.columns))
missing_translations.extend(_check_for_missing_translations_one_column_group(row, columns.list_cols.columns))
missing_translations.extend(_check_for_missing_translations_one_column_group(row, columns.list_cols))
if missing_translations:
return MissingNodeTranslationProblem(empty_columns=missing_translations, index_num=row_index)
return None
@@ -258,14 +250,6 @@ def _check_for_missing_translations_one_column_group(row: pd.Series[Any], column
return []


def _compose_all_combinatoric_column_titles(nums: list[str], languages: set[str]) -> Columns:
node_cols = []
for n in nums:
node_cols.append(ColumnNodes(level_num=int(n), columns=[f"{lang}_{n}" for lang in languages]))
list_columns = ColumnsList([f"{lang}_list" for lang in languages])
return Columns(list_cols=list_columns, node_cols=node_cols)


def _check_for_erroneous_entries_all_excels(sheet_list: list[ExcelSheet]) -> None:
problems: list[SheetProblem] = [
p for sheet in sheet_list if (p := _check_for_erroneous_entries_one_list(sheet)) is not None
@@ -277,10 +261,9 @@ def _check_for_erroneous_entries_all_excels(sheet_list: list[ExcelSheet]) -> Non


def _check_for_erroneous_entries_one_list(sheet: ExcelSheet) -> ListSheetContentProblem | None:
preferred_lang = get_preferred_language(sheet.df.columns)
preferred_cols = get_columns_of_preferred_lang(sheet.df.columns, preferred_lang, r"\d+")
preferred_cols = get_columns_of_preferred_lang(sheet.df.columns, sheet.col_info.preferred_lang, r"\d+")
preferred_cols = sorted(preferred_cols)
preferred_cols.insert(0, f"{preferred_lang}_list")
preferred_cols.insert(0, f"{sheet.col_info.preferred_lang}_list")
problems = _check_for_erroneous_node_info_one_df(sheet.df, preferred_cols)
if problems:
list_problems = cast(list[Problem], problems)
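
The change above is representative of the whole file: instead of re-deriving the preferred language from sheet.df.columns inside each check, the checks now read it from the column info attached to the sheet. A minimal, runnable sketch of that access pattern follows; FakeColumns and FakeExcelSheet are simplified stand-ins invented for illustration, not the real dsp_tools dataclasses (those appear further down in this diff).

from __future__ import annotations

from dataclasses import dataclass

import pandas as pd


@dataclass
class FakeColumns:  # stand-in for new_lists.models.deserialise.Columns
    preferred_lang: str
    list_cols: list[str]


@dataclass
class FakeExcelSheet:  # stand-in for new_lists.models.deserialise.ExcelSheet
    excel_name: str
    sheet_name: str
    col_info: FakeColumns
    df: pd.DataFrame


def check_unique_list_names(sheet: FakeExcelSheet) -> bool:
    # After the refactor: read the preferred language from col_info
    # instead of calling get_preferred_language(sheet.df.columns, r"list").
    unique_list_names = sheet.df[f"{sheet.col_info.preferred_lang}_list"].unique()
    return len(unique_list_names) == 1


df = pd.DataFrame({"en_list": ["colours", "colours"], "en_1": ["red", "blue"]})
sheet = FakeExcelSheet("lists.xlsx", "colours", FakeColumns("en", ["en_list"]), df)
print(check_unique_list_names(sheet))  # True
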
69 changes: 32 additions & 37 deletions src/dsp_tools/commands/excel2json/new_lists/make_new_lists.py
@@ -19,10 +19,11 @@
from dsp_tools.commands.excel2json.new_lists.models.serialise import ListNode
from dsp_tools.commands.excel2json.new_lists.models.serialise import ListRoot
from dsp_tools.commands.excel2json.new_lists.utils import get_all_languages_for_columns
from dsp_tools.commands.excel2json.new_lists.utils import get_column_info
from dsp_tools.commands.excel2json.new_lists.utils import get_columns_of_preferred_lang
from dsp_tools.commands.excel2json.new_lists.utils import get_hierarchy_nums
from dsp_tools.commands.excel2json.new_lists.utils import get_lang_string_from_column_name
from dsp_tools.commands.excel2json.new_lists.utils import get_preferred_language
from dsp_tools.commands.excel2json.utils import add_optional_columns
from dsp_tools.commands.excel2json.utils import read_and_clean_all_sheets
from dsp_tools.models.exceptions import InputError

@@ -45,8 +46,8 @@ def new_excel2lists(
Returns:
a tuple consisting of the "lists" section as Python list, and the success status (True if everything went well)
"""
sheet_list = _parse_files(excelfolder)
sheet_list = _prepare_dfs(sheet_list)
df_dict = _parse_files(excelfolder)
sheet_list = _prepare_sheets(df_dict)

finished_lists = _make_serialised_lists(sheet_list)
validate_lists_section_with_schema(lists_section=finished_lists)
@@ -59,55 +60,50 @@
return finished_lists, True


def _parse_files(excelfolder: Path | str) -> list[ExcelSheet]:
def _parse_files(excelfolder: Path | str) -> dict[str, dict[str, pd.DataFrame]]:
file_names = [file for file in Path(excelfolder).glob("*list*.xlsx", case_sensitive=False) if _non_hidden(file)]
all_sheets = []
df_dict = {}
for file in file_names:
all_sheets.extend(
[
ExcelSheet(excel_name=str(file), sheet_name=name, df=df)
for name, df in read_and_clean_all_sheets(file).items()
]
)
return all_sheets
df_dict[str(file)] = read_and_clean_all_sheets(file)
return df_dict


def _non_hidden(path: Path) -> bool:
return not regex.search(r"^(\.|~\$).+", path.name)
def _prepare_sheets(df_dict: dict[str, dict[str, pd.DataFrame]]) -> list[ExcelSheet]:
all_sheets: list[ExcelSheet] = []
for file, sheets in df_dict.items():
all_sheets.extend(_prepare_one_sheet(df, file, sheet_name) for sheet_name, df in sheets.items())
make_all_excel_compliance_checks(all_sheets)
return _construct_ids(all_sheets)


def _prepare_dfs(sheet_list: list[ExcelSheet]) -> list[ExcelSheet]:
sheet_list = _add_id_optional_column_if_not_exists(sheet_list)
make_all_excel_compliance_checks(sheet_list)
return _construct_ids(sheet_list)
def _prepare_one_sheet(df: pd.DataFrame, filename: str, sheet_name: str) -> ExcelSheet:
columns = get_column_info(df.columns)
df = add_optional_columns(df, {"id (optional)"})
return ExcelSheet(excel_name=filename, sheet_name=sheet_name, col_info=columns, df=df)


def _add_id_optional_column_if_not_exists(sheet_list: list[ExcelSheet]) -> list[ExcelSheet]:
all_sheets = []
for sheet in sheet_list:
if "id (optional)" not in sheet.df.columns:
df = sheet.df
df["id (optional)"] = pd.NA
all_sheets.append(ExcelSheet(excel_name=sheet.excel_name, sheet_name=sheet.sheet_name, df=df))
else:
all_sheets.append(sheet)
return all_sheets
def _non_hidden(path: Path) -> bool:
return not regex.search(r"^(\.|~\$).+", path.name)


def _construct_ids(sheet_list: list[ExcelSheet]) -> list[ExcelSheet]:
all_sheets = []
for sheet in sheet_list:
df = _complete_id_one_df(sheet.df, get_preferred_language(sheet.df.columns))
all_sheets.append(ExcelSheet(excel_name=sheet.excel_name, sheet_name=sheet.sheet_name, df=df))
df = _complete_id_one_df(sheet.df, sheet.col_info.preferred_lang)
all_sheets.append(
ExcelSheet(excel_name=sheet.excel_name, col_info=sheet.col_info, sheet_name=sheet.sheet_name, df=df)
)
all_sheets = _resolve_duplicate_ids_all_excels(all_sheets)
return _fill_parent_id_col_all_excels(all_sheets)


def _fill_parent_id_col_all_excels(sheet_list: list[ExcelSheet]) -> list[ExcelSheet]:
all_sheets = []
for sheet in sheet_list:
df = _fill_parent_id_col_one_df(sheet.df, get_preferred_language(sheet.df.columns))
all_sheets.append(ExcelSheet(excel_name=sheet.excel_name, sheet_name=sheet.sheet_name, df=df))
df = _fill_parent_id_col_one_df(sheet.df, sheet.col_info.preferred_lang)
all_sheets.append(
ExcelSheet(excel_name=sheet.excel_name, sheet_name=sheet.sheet_name, col_info=sheet.col_info, df=df)
)
return all_sheets


@@ -139,18 +135,17 @@ def _resolve_duplicate_ids_all_excels(sheet_list: list[ExcelSheet]) -> list[Exce
def _remove_duplicate_ids_in_all_excels(duplicate_ids: list[str], sheet_list: list[ExcelSheet]) -> list[ExcelSheet]:
all_sheets = []
for sheet in sheet_list:
preferred_lang = get_preferred_language(sheet.df.columns)
df = sheet.df
for i, row in df.iterrows():
if row["id"] in duplicate_ids and pd.isna(row["id (optional)"]):
df.at[i, "id"] = _construct_non_duplicate_id_string(df.iloc[int(str(i))], preferred_lang)
all_sheets.append(ExcelSheet(sheet.excel_name, sheet.sheet_name, df))
df.at[i, "id"] = _construct_non_duplicate_id_string(df.iloc[int(str(i))], sheet.col_info.preferred_lang)
all_sheets.append(
ExcelSheet(excel_name=sheet.excel_name, sheet_name=sheet.sheet_name, col_info=sheet.col_info, df=df)
)
return sheet_list


def _complete_id_one_df(df: pd.DataFrame, preferred_language: str) -> pd.DataFrame:
if "id (optional)" not in df.columns:
df["id (optional)"] = pd.NA
df = _create_auto_id_one_df(df, preferred_language)
df["id"] = df["id (optional)"].fillna(df["auto_id"])
df = _resolve_duplicate_ids_keep_custom_change_auto_id_one_df(df, preferred_language)
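
Taken together, the new helpers reshape the parsing step: _parse_files now returns a mapping of file name to sheet name to DataFrame, and _prepare_sheets flattens that mapping into sheet objects whose column info is computed once up front (with the id (optional) column added via add_optional_columns). Below is a rough, self-contained sketch of that flow under those assumptions; SheetStub and the inline language detection are illustrative simplifications, not the real ExcelSheet or get_column_info.

from __future__ import annotations

from dataclasses import dataclass

import pandas as pd


@dataclass
class SheetStub:  # illustrative stand-in for ExcelSheet
    excel_name: str
    sheet_name: str
    preferred_lang: str  # the real class carries a full Columns object
    df: pd.DataFrame


def parse_files_stub() -> dict[str, dict[str, pd.DataFrame]]:
    # Mirrors the new return type: {file name: {sheet name: DataFrame}}.
    df = pd.DataFrame({"en_list": ["colours"], "en_1": ["red"], "de_1": ["rot"]})
    return {"lists.xlsx": {"colours": df}}


def prepare_sheets_stub(df_dict: dict[str, dict[str, pd.DataFrame]]) -> list[SheetStub]:
    sheets = []
    for file, sheets_by_name in df_dict.items():
        for name, df in sheets_by_name.items():
            # Column info is derived once here, instead of in every later check.
            lang = "en" if any(col.startswith("en_") for col in df.columns) else "de"
            sheets.append(SheetStub(file, name, lang, df))
    return sheets


for sheet in prepare_sheets_stub(parse_files_stub()):
    print(sheet.excel_name, sheet.sheet_name, sheet.preferred_lang)
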
src/dsp_tools/commands/excel2json/new_lists/models/deserialise.py
@@ -9,23 +9,20 @@
class ExcelSheet:
excel_name: str
sheet_name: str
col_info: Columns
df: pd.DataFrame


@dataclass
class Columns:
list_cols: ColumnsList
preferred_lang: str
list_cols: list[str]
node_cols: list[ColumnNodes]

def __post_init__(self) -> None:
self.node_cols = sorted(self.node_cols, key=lambda x: x.level_num, reverse=True)


@dataclass
class ColumnsList:
columns: list[str]


@dataclass
class ColumnNodes:
level_num: int
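
For reference, the reshaped dataclasses look like this when assembled: Columns now carries the preferred language and a plain list of list-column names (the ColumnsList wrapper is removed), while __post_init__ still orders the node columns from the deepest hierarchy level upwards. The sketch below reproduces that structure from the diff together with a small usage example; the sample column names are invented.

from __future__ import annotations

from dataclasses import dataclass


@dataclass
class ColumnNodes:
    level_num: int
    columns: list[str]


@dataclass
class Columns:
    preferred_lang: str
    list_cols: list[str]  # previously wrapped in a separate ColumnsList dataclass
    node_cols: list[ColumnNodes]

    def __post_init__(self) -> None:
        # Deepest hierarchy level first, as in the implementation above.
        self.node_cols = sorted(self.node_cols, key=lambda x: x.level_num, reverse=True)


cols = Columns(
    preferred_lang="en",
    list_cols=["en_list", "de_list"],
    node_cols=[ColumnNodes(1, ["en_1", "de_1"]), ColumnNodes(2, ["en_2", "de_2"])],
)
print([node.level_num for node in cols.node_cols])  # [2, 1]
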
28 changes: 26 additions & 2 deletions src/dsp_tools/commands/excel2json/new_lists/utils.py
@@ -1,8 +1,12 @@
from __future__ import annotations

from typing import Any

import pandas as pd
import regex

from dsp_tools.commands.excel2json.new_lists.models.deserialise import ColumnNodes
from dsp_tools.commands.excel2json.new_lists.models.deserialise import Columns
from dsp_tools.models.exceptions import InputError


@@ -20,6 +24,26 @@ def get_columns_of_preferred_lang(
return sorted(col for col in columns if regex.search(rf"^{preferred_language}_{ending}$", col))


def get_column_info(df_cols: pd.Index[Any]) -> Columns:
"""
Constructs and returns all the columns that should be present in the df based on the languages used.
Args:
df_cols: columns of the df
Returns:
Object with column info
"""
hierarchy_nums = get_hierarchy_nums(df_cols)
all_langs = get_all_languages_for_columns(df_cols)
preferred_lang = get_preferred_language_from_columns(df_cols)
node_cols = []
for n in hierarchy_nums:
node_cols.append(ColumnNodes(level_num=int(n), columns=[f"{lang}_{n}" for lang in all_langs]))
list_columns = [f"{lang}_list" for lang in all_langs]
return Columns(preferred_lang=preferred_lang, list_cols=list_columns, node_cols=node_cols)


def get_hierarchy_nums(columns: pd.Index[str]) -> list[int]:
"""Get all the numbers that are used in the column names that contain a language tag."""
return sorted(
@@ -32,9 +56,9 @@ def get_all_languages_for_columns(columns: pd.Index[str], ending: str = r"(\d+|l
return set(res for x in columns if (res := get_lang_string_from_column_name(x, ending)))


def get_preferred_language(columns: pd.Index[str], ending: str = r"(\d+|list)") -> str:
def get_preferred_language_from_columns(columns: pd.Index[str], ending: str = r"(\d+|list)") -> str:
"""Get the language tag of the preferred language."""
match = [res.group(1) for x in columns if (res := regex.search(rf"^(en|de|fr|it|rm)_{ending}+$", x))]
match = {res.group(1) for x in columns if (res := regex.search(rf"^(en|de|fr|it|rm)_{ending}+$", x))}
if "en" in match:
return "en"
elif "de" in match:
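
get_column_info is now the single place where a sheet's column layout is analysed. As a rough illustration of what it derives from a column index, here is a self-contained sketch using the standard re module and plain dicts; the real function returns the Columns dataclass, delegates to get_hierarchy_nums, get_all_languages_for_columns and get_preferred_language_from_columns, and the language priority after en and de is assumed here, since that part of the code is truncated in this view.

from __future__ import annotations

import re

import pandas as pd

LANG_PRIORITY = ("en", "de", "fr", "it", "rm")  # en preferred, then de; remaining order assumed


def column_info_sketch(df_cols: pd.Index) -> dict:
    # Hierarchy levels used by node columns such as en_1, de_2, ...
    nums = sorted({int(m.group(2)) for c in df_cols if (m := re.search(r"^(en|de|fr|it|rm)_(\d+)$", c))})
    # All languages that appear in node or list columns.
    langs = {m.group(1) for c in df_cols if (m := re.search(r"^(en|de|fr|it|rm)_(\d+|list)$", c))}
    preferred = next(lang for lang in LANG_PRIORITY if lang in langs)
    return {
        "preferred_lang": preferred,
        "list_cols": [f"{lang}_list" for lang in sorted(langs)],
        "node_cols": {n: [f"{lang}_{n}" for lang in sorted(langs)] for n in nums},
    }


df = pd.DataFrame(columns=["id (optional)", "en_list", "de_list", "en_1", "de_1", "en_2", "de_2"])
print(column_info_sketch(df.columns))
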