From b9ad296dd9cf44cfd491621c454d79bafe7d15fc Mon Sep 17 00:00:00 2001 From: Mikhail Sveshnikov Date: Thu, 23 Jan 2025 11:24:31 +0000 Subject: [PATCH] fix descriptor columns (#1440) --- src/evidently/base_metric.py | 5 ++- src/evidently/features/generated_features.py | 2 +- src/evidently/features/llm_judge.py | 6 +-- src/evidently/future/datasets.py | 31 +++++++-------- .../descriptors/_generate_descriptors.py | 4 +- .../descriptors/generated_descriptors.py | 39 +++++++++++-------- tests/features/test_llm_judge.py | 8 ++-- 7 files changed, 51 insertions(+), 44 deletions(-) diff --git a/src/evidently/base_metric.py b/src/evidently/base_metric.py index 631446be84..77d235f796 100644 --- a/src/evidently/base_metric.py +++ b/src/evidently/base_metric.py @@ -77,13 +77,16 @@ class DatasetType(Enum): ADDITIONAL = "additional" +DisplayName = str + + @autoregister class ColumnName(EnumValueMixin, EvidentlyBaseModel): class Config: type_alias = "evidently:base:ColumnName" name: str - display_name: str + display_name: DisplayName dataset: DatasetType _feature_class: Optional["GeneratedFeatures"] = PrivateAttr(None) diff --git a/src/evidently/features/generated_features.py b/src/evidently/features/generated_features.py index f8ebd96f4c..1f4dc221bf 100644 --- a/src/evidently/features/generated_features.py +++ b/src/evidently/features/generated_features.py @@ -93,7 +93,7 @@ def feature_name(self, subcolumn: Optional[str] = None): return self.as_column(subcolumn) def _create_column_name(self, subcolumn: Optional[str]) -> str: - subcolumn = f".{subcolumn}" if subcolumn is not None else "" + subcolumn = f".{subcolumn}" if subcolumn is not None and len(subcolumn) > 0 else "" return f"{self.get_fingerprint()}{subcolumn}" def _extract_subcolumn_name(self, column_name: str) -> Optional[str]: diff --git a/src/evidently/features/llm_judge.py b/src/evidently/features/llm_judge.py index eece324dfb..381fe4ad31 100644 --- a/src/evidently/features/llm_judge.py +++ b/src/evidently/features/llm_judge.py @@ -76,7 +76,7 @@ class Config: include_score: bool = False score_range: Tuple[float, float] = (0.0, 1.0) - output_column: str = "category" + output_column: str = "" output_reasoning_column: str = "reasoning" output_score_column: str = "score" @@ -153,7 +153,7 @@ def get_type(self, subcolumn: Optional[str]) -> ColumnType: return ColumnType.Text if subcolumn == self.output_score_column: return ColumnType.Numerical - if subcolumn == self.output_column: + if subcolumn == self.output_column or subcolumn is None: return ColumnType.Categorical raise ValueError(f"Unknown subcolumn {subcolumn}") @@ -196,7 +196,7 @@ def generate_features(self, data: pd.DataFrame, data_definition: DataDefinition, def list_columns(self) -> List["ColumnName"]: return [ - self._create_column(c, display_name=f"{self.display_name or ''} {c}") + self._create_column(c, display_name=f"{self.display_name or ''} {c}".strip()) for c in self.template.list_output_columns() ] diff --git a/src/evidently/future/datasets.py b/src/evidently/future/datasets.py index 83ba846c44..ccbfa1e185 100644 --- a/src/evidently/future/datasets.py +++ b/src/evidently/future/datasets.py @@ -16,6 +16,7 @@ from evidently import ColumnMapping from evidently import ColumnType +from evidently.base_metric import DisplayName from evidently.features.generated_features import GeneratedFeatures from evidently.metric_results import Label from evidently.options.base import Options @@ -177,7 +178,7 @@ def __init__(self, alias: str): self._alias = alias @abc.abstractmethod - def 
generate_data(self, dataset: "Dataset") -> Union[DatasetColumn, Dict[str, DatasetColumn]]: + def generate_data(self, dataset: "Dataset") -> Union[DatasetColumn, Dict[DisplayName, DatasetColumn]]: raise NotImplementedError() @property @@ -192,22 +193,22 @@ def __init__(self, feature: GeneratedFeatures, alias: Optional[str] = None): self._feature = feature def get_dataset_column(self, column_name: str, values: pd.Series) -> DatasetColumn: - column_type = self._feature.get_type(f"{self._feature.get_fingerprint()}.{column_name}") + column_type = self._feature.get_type(column_name) if column_type == ColumnType.Numerical: values = pd.to_numeric(values, errors="coerce") dataset_column = DatasetColumn(type=column_type, data=values) return dataset_column - def generate_data(self, dataset: "Dataset") -> Union[DatasetColumn, Dict[str, DatasetColumn]]: - feature = self._feature.generate_features( + def generate_data(self, dataset: "Dataset") -> Union[DatasetColumn, Dict[DisplayName, DatasetColumn]]: + feature = self._feature.generate_features_renamed( dataset.as_dataframe(), create_data_definition(None, dataset.as_dataframe(), ColumnMapping()), Options(), ) - if len(feature.columns) > 1: - return {col: self.get_dataset_column(col, feature[col]) for col in feature.columns} - col = feature.columns[0] - return self.get_dataset_column(col, feature[col]) + return { + col.display_name: self.get_dataset_column(col.name, feature[col.name]) + for col in self._feature.list_columns() + } def _determine_desccriptor_column_name(alias: str, columns: List[str]): @@ -361,15 +362,11 @@ def add_column(self, key: str, data: DatasetColumn): self._data_definition.categorical_descriptors.append(key) def add_descriptor(self, descriptor: Descriptor): - key = _determine_desccriptor_column_name(descriptor.alias, self._data.columns.tolist()) - new_column = descriptor.generate_data(self) - if isinstance(new_column, DatasetColumn): - self.add_column(key, new_column) - elif len(new_column) > 1: - for col, value in new_column.items(): - self.add_column(f"{key}.{col}", value) - else: - self.add_column(key, list(new_column.values())[0]) + new_columns = descriptor.generate_data(self) + if isinstance(new_columns, DatasetColumn): + new_columns = {descriptor.alias: new_columns} + for col, value in new_columns.items(): + self.add_column(_determine_desccriptor_column_name(col, self._data.columns.tolist()), value) def add_descriptors(self, descriptors: List[Descriptor]): for descriptor in descriptors: diff --git a/src/evidently/future/descriptors/_generate_descriptors.py b/src/evidently/future/descriptors/_generate_descriptors.py index aaf60ef16e..e479fa8a07 100644 --- a/src/evidently/future/descriptors/_generate_descriptors.py +++ b/src/evidently/future/descriptors/_generate_descriptors.py @@ -131,7 +131,7 @@ def create_llm_descriptor_functions(feature_class: Type[BaseLLMEval]): args, kwargs = get_args_kwargs(feature_class) kwargs["alias"] = ("Optional[str]", "None") - kwargs.pop("display_name", None) + has_display_name = kwargs.pop("display_name", None) is not None args_str = ", ".join(f"{a}: {t}" for a, t in args.items()) if len(kwargs) > 0: kwargs_str = ", ".join(f"{a}: {t} = {d}" for a, (t, d) in kwargs.items()) @@ -140,6 +140,8 @@ def create_llm_descriptor_functions(feature_class: Type[BaseLLMEval]): else: kwargs_str = "" class_args = ", ".join(f"{k}={k}" for k in chain(args, kwargs) if k != "alias") + if has_display_name: + class_args += ", display_name=alias" res = f""" class {name}(FeatureDescriptor): def __init__(self, 
column_name: str, {args_str}{kwargs_str}): diff --git a/src/evidently/future/descriptors/generated_descriptors.py b/src/evidently/future/descriptors/generated_descriptors.py index ab64187680..5a21abfb92 100644 --- a/src/evidently/future/descriptors/generated_descriptors.py +++ b/src/evidently/future/descriptors/generated_descriptors.py @@ -8,9 +8,6 @@ from evidently.features.llm_judge import Uncertainty from evidently.future.datasets import FeatureDescriptor -DEFAULT_LLM_PROVIDER = "openai" -DEFAULT_LLM_MODEL = "gpt-4o-mini" - class BERTScore(FeatureDescriptor): def __init__( @@ -373,8 +370,8 @@ class BiasLLMEval(FeatureDescriptor): def __init__( self, column_name: str, - provider: str = DEFAULT_LLM_PROVIDER, - model: str = DEFAULT_LLM_MODEL, + provider: str = "openai", + model: str = "gpt-4o-mini", additional_columns: Optional[Dict[str, str]] = None, include_category: Optional[bool] = None, include_score: Optional[bool] = None, @@ -392,6 +389,7 @@ def __init__( include_score=include_score, include_reasoning=include_reasoning, uncertainty=uncertainty, + display_name=alias, ).feature(column_name) super().__init__(feature, alias=alias) @@ -419,6 +417,7 @@ def __init__( include_score=include_score, include_reasoning=include_reasoning, uncertainty=uncertainty, + display_name=alias, ).feature(column_name) super().__init__(feature, alias=alias) @@ -428,8 +427,8 @@ def __init__( self, column_name: str, question: str, - provider: str = DEFAULT_LLM_PROVIDER, - model: str = DEFAULT_LLM_MODEL, + provider: str = "openai", + model: str = "gpt-4o-mini", additional_columns: Optional[Dict[str, str]] = None, include_category: Optional[bool] = None, include_score: Optional[bool] = None, @@ -448,6 +447,7 @@ def __init__( include_score=include_score, include_reasoning=include_reasoning, uncertainty=uncertainty, + display_name=alias, ).feature(column_name) super().__init__(feature, alias=alias) @@ -456,8 +456,8 @@ class DeclineLLMEval(FeatureDescriptor): def __init__( self, column_name: str, - provider: str = DEFAULT_LLM_PROVIDER, - model: str = DEFAULT_LLM_MODEL, + provider: str = "openai", + model: str = "gpt-4o-mini", additional_columns: Optional[Dict[str, str]] = None, include_category: Optional[bool] = None, include_score: Optional[bool] = None, @@ -475,6 +475,7 @@ def __init__( include_score=include_score, include_reasoning=include_reasoning, uncertainty=uncertainty, + display_name=alias, ).feature(column_name) super().__init__(feature, alias=alias) @@ -483,9 +484,9 @@ class LLMEval(FeatureDescriptor): def __init__( self, column_name: str, + provider: str, + model: str, template: BaseLLMPromptTemplate, - provider: str = DEFAULT_LLM_PROVIDER, - model: str = DEFAULT_LLM_MODEL, additional_columns: Optional[Dict[str, str]] = None, subcolumn: Optional[str] = None, alias: Optional[str] = None, @@ -498,6 +499,7 @@ def __init__( template=template, additional_columns=additional_columns, subcolumn=subcolumn, + display_name=alias, ).feature(column_name) super().__init__(feature, alias=alias) @@ -506,8 +508,8 @@ class NegativityLLMEval(FeatureDescriptor): def __init__( self, column_name: str, - provider: str = DEFAULT_LLM_PROVIDER, - model: str = DEFAULT_LLM_MODEL, + provider: str = "openai", + model: str = "gpt-4o-mini", additional_columns: Optional[Dict[str, str]] = None, include_category: Optional[bool] = None, include_score: Optional[bool] = None, @@ -525,6 +527,7 @@ def __init__( include_score=include_score, include_reasoning=include_reasoning, uncertainty=uncertainty, + display_name=alias, 
).feature(column_name) super().__init__(feature, alias=alias) @@ -533,8 +536,8 @@ class PIILLMEval(FeatureDescriptor): def __init__( self, column_name: str, - provider: str = DEFAULT_LLM_PROVIDER, - model: str = DEFAULT_LLM_MODEL, + provider: str = "openai", + model: str = "gpt-4o-mini", additional_columns: Optional[Dict[str, str]] = None, include_category: Optional[bool] = None, include_score: Optional[bool] = None, @@ -552,6 +555,7 @@ def __init__( include_score=include_score, include_reasoning=include_reasoning, uncertainty=uncertainty, + display_name=alias, ).feature(column_name) super().__init__(feature, alias=alias) @@ -560,8 +564,8 @@ class ToxicityLLMEval(FeatureDescriptor): def __init__( self, column_name: str, - provider: str = DEFAULT_LLM_PROVIDER, - model: str = DEFAULT_LLM_MODEL, + provider: str = "openai", + model: str = "gpt-4o-mini", additional_columns: Optional[Dict[str, str]] = None, include_category: Optional[bool] = None, include_score: Optional[bool] = None, @@ -579,5 +583,6 @@ def __init__( include_score=include_score, include_reasoning=include_reasoning, uncertainty=uncertainty, + display_name=alias, ).feature(column_name) super().__init__(feature, alias=alias) diff --git a/tests/features/test_llm_judge.py b/tests/features/test_llm_judge.py index 96104912ff..ac6087903b 100644 --- a/tests/features/test_llm_judge.py +++ b/tests/features/test_llm_judge.py @@ -96,7 +96,7 @@ def __init__(self, model: str, options: Options): async def complete(self, messages: List[LLMMessage]) -> str: text = messages[-1].content cat = re.findall("___text_starts_here___\n(.*)\n___text_ends_here___", text)[0][0] - return json.dumps({"category": cat}) + return json.dumps({"": cat}) @pytest.mark.asyncio @@ -112,7 +112,7 @@ def test_llm_judge(): dd = DataDefinition(columns={}, reference_present=False) fts = llm_judge.generate_features(data, dd, Options()) - pd.testing.assert_frame_equal(fts, pd.DataFrame({"category": ["A", "B"]})) + pd.testing.assert_frame_equal(fts, pd.DataFrame({"": ["A", "B"]})) @pytest.mark.asyncio @@ -128,7 +128,7 @@ def test_multicol_llm_judge(): dd = DataDefinition(columns={}, reference_present=False) fts = llm_judge.generate_features(data, dd, Options()) - pd.testing.assert_frame_equal(fts, pd.DataFrame({"category": ["A", "B"]})) + pd.testing.assert_frame_equal(fts, pd.DataFrame({"": ["A", "B"]})) def test_run_snapshot_with_llm_judge(): @@ -148,7 +148,7 @@ def test_run_snapshot_with_llm_judge(): { "metric": "ColumnSummaryMetric", "result": { - "column_name": "Negativity category", + "column_name": "Negativity", "column_type": "cat", "current_characteristics": { "count": 2,
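
Reviewer note (not part of the patch): a minimal usage sketch of the column naming this change targets. It assumes `Dataset.from_pandas` as the constructor and `evidently.future.descriptors` re-exporting `NegativityLLMEval` (neither is shown in this diff), plus an OpenAI key configured for the LLM judge; the input frame and the "answer" column are purely illustrative.

import pandas as pd

from evidently.future.datasets import DataDefinition, Dataset
from evidently.future.descriptors import NegativityLLMEval

# Illustrative input; any text column works.
data = pd.DataFrame({"answer": ["Happy to help!", "I refuse to answer that."]})

# Dataset.from_pandas and an empty DataDefinition are assumptions here;
# add_descriptors is the method touched by this patch.
dataset = Dataset.from_pandas(data, data_definition=DataDefinition())
dataset.add_descriptors(
    [
        # The alias is now forwarded as display_name to the underlying LLM judge
        # feature (see the generated_descriptors.py hunks above).
        NegativityLLMEval("answer", alias="Negativity", include_reasoning=True),
    ]
)

# With this patch, descriptor columns are keyed by display name: the main
# category output becomes "Negativity" (the judge's default output_column is
# now ""), and the reasoning output "Negativity reasoning", rather than keys
# built from the alias plus the raw generated-feature column names.
print(dataset.as_dataframe().columns.tolist())

Because every generated column now carries its own display name, Dataset.add_descriptor no longer needs the "<alias>.<subcolumn>" naming branch for multi-output descriptors, which is what the datasets.py and test_llm_judge.py hunks above reflect.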