diff --git a/docs/extras/guides/evaluation/string/scoring_eval_chain.ipynb b/docs/extras/guides/evaluation/string/scoring_eval_chain.ipynb
index c6e101cefc51b..72634ba64b0f8 100644
--- a/docs/extras/guides/evaluation/string/scoring_eval_chain.ipynb
+++ b/docs/extras/guides/evaluation/string/scoring_eval_chain.ipynb
@@ -7,7 +7,7 @@
    "source": [
     "# Overall quality evaluation\n",
     "\n",
-    "In scenarios where you wish to score a model's output based on a criteria set and/or reference answer, the `Score` evaluator can be helpful. This is most useful for comparing the performance of different models on a given task.\n",
+    "In scenarios where you wish to score a model's output from 1-10 based on a criteria set and/or reference answer, the `Score` evaluator can be helpful. This is most useful for comparing the performance of different models on a given task.\n",
     "\n",
     "Refer to the documentation of the [ScoreStringEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.scoring.eval_chain.ScoreStringEvalChain.html#langchain.evaluation.scoring.eval_chain.ScoreStringEvalChain) class for full details.\n",
     "\n",
@@ -113,7 +113,7 @@
    "- reasoning: String \"chain of thought reasoning\" from the LLM generated prior to creating the score\n",
    "\n",
    "\n",
-   "Similar to [CriteriaEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.criteria.eval_chain.CriteriaEvalChain.html#langchain.evaluation.criteria.eval_chain.CriteriaEvalChain) you also load the \"labeled_score_string\" evaluator for scoring labeled outputs."
+   "Similar to [CriteriaEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.criteria.eval_chain.CriteriaEvalChain.html#langchain.evaluation.criteria.eval_chain.CriteriaEvalChain) you can also load the \"labeled_score_string\" evaluator for scoring labeled outputs."
   ]
  }
 ],
diff --git a/libs/langchain/langchain/evaluation/scoring/eval_chain.py b/libs/langchain/langchain/evaluation/scoring/eval_chain.py
index 5b7be2779a0ee..28ba5deac76fc 100644
--- a/libs/langchain/langchain/evaluation/scoring/eval_chain.py
+++ b/libs/langchain/langchain/evaluation/scoring/eval_chain.py
@@ -1,4 +1,4 @@
-"""Base classes for comparing the output of two models."""
+"""Base classes for scoring the output of a model on a scale of 1-10."""
 from __future__ import annotations

 import logging
@@ -48,7 +48,7 @@
 }


-def resolve_pairwise_criteria(
+def resolve_criteria(
     criteria: Optional[Union[CRITERIA_TYPE, str, List[CRITERIA_TYPE]]]
 ) -> dict:
     """Resolve the criteria for the pairwise evaluator.
@@ -81,7 +81,7 @@ def resolve_pairwise_criteria(
         criteria_ = {
             k: v
             for criterion in criteria
-            for k, v in resolve_pairwise_criteria(criterion).items()
+            for k, v in resolve_criteria(criterion).items()
         }
     else:
         if not criteria:
@@ -252,7 +252,7 @@ def from_llm(
                 f"Input variables should be {expected_input_vars}, "
                 f"but got {prompt_.input_variables}"
             )
-        criteria_ = resolve_pairwise_criteria(criteria)
+        criteria_ = resolve_criteria(criteria)
         criteria_str = "\n".join(f"{k}: {v}" if v else k for k, v in criteria_.items())
         criteria_str = (
             CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else DEFAULT_CRITERIA
@@ -421,7 +421,7 @@ def from_llm(
                 f"Input variables should be {expected_input_vars}, "
                 f"but got {prompt_.input_variables}"
             )
-        criteria_ = resolve_pairwise_criteria(criteria)
+        criteria_ = resolve_criteria(criteria)
         criteria_str = "\n".join(f"{k}: {v}" for k, v in criteria_.items())
         criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else ""
         return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)
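
For reviewer context (not part of the patch), here is a minimal sketch of how the scoring evaluator described in the updated notebook is typically used, assuming a working `langchain` install and an OpenAI API key in the environment; the `gpt-4` model choice and the prediction/reference/input strings are illustrative only.

```python
# Minimal usage sketch for the "labeled_score_string" evaluator (assumed setup).
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import load_evaluator

# The labeled variant expects a reference answer in addition to the prediction.
evaluator = load_evaluator("labeled_score_string", llm=ChatOpenAI(model="gpt-4"))

result = evaluator.evaluate_strings(
    prediction="You can find them in the dresser's third drawer.",
    reference="The socks are in the third drawer of the dresser.",
    input="Where are my socks?",
)

print(result["score"])      # integer score from 1-10
print(result["reasoning"])  # chain-of-thought reasoning produced before scoring
```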