feat: Evals with explanations #1699

Merged · 33 commits · Nov 14, 2023
Changes from 20 commits

Commits (33)
7956822  Add explanation template (anticorrelator, Nov 1, 2023)
763c481  Spike out explanations (anticorrelator, Nov 2, 2023)
38d2be9  Ruff 🐶 (anticorrelator, Nov 2, 2023)
c314c74  Merge remote-tracking branch 'origin' into dustin/evals-with-explanat… (anticorrelator, Nov 6, 2023)
a66d6eb  Use tailored explanation prompt (anticorrelator, Nov 6, 2023)
85092ca  Add explanation templates for all evals (anticorrelator, Nov 7, 2023)
2b0f29b  Wire up prompt template objects (anticorrelator, Nov 7, 2023)
f0aa75f  Update models to use new template object (anticorrelator, Nov 8, 2023)
6c6140a  Ruff 🐶 (anticorrelator, Nov 8, 2023)
a11a4ba  Resolve type and linter issues (anticorrelator, Nov 8, 2023)
9c5af4e  Fix more typing issues (anticorrelator, Nov 8, 2023)
a551d60  Address first round of feedback (anticorrelator, Nov 9, 2023)
67e9e13  Extract `ClassificationTemplate` ABC (anticorrelator, Nov 10, 2023)
75a027c  Label extraction belongs to the "template" object (anticorrelator, Nov 10, 2023)
6fc6fc6  Add logging for unparseable labels (anticorrelator, Nov 10, 2023)
eb11ebb  Merge remote-tracking branch 'origin' into dustin/evals-with-explanat… (anticorrelator, Nov 13, 2023)
59d9ded  Merge remote-tracking branch 'origin' into dustin/evals-with-explanat… (anticorrelator, Nov 13, 2023)
a2509c9  Patch in openai key environment variable for tests (anticorrelator, Nov 13, 2023)
eaff46d  Refactor to address feedback (anticorrelator, Nov 13, 2023)
b8e13e3  Evaluators should use PromptTemplates (anticorrelator, Nov 13, 2023)
d0f1d8b  Pair with Mikyo (anticorrelator, Nov 14, 2023)
888f223  Fix for CI (anticorrelator, Nov 14, 2023)
cebda8c  `PROMPT_TEMPLATE_STR` -> `PROMPT_TEMPLATE` (anticorrelator, Nov 14, 2023)
093e59c  Print prompt if verbose (anticorrelator, Nov 14, 2023)
17025ef  Add __repr__ to `PromptTemplate` (anticorrelator, Nov 14, 2023)
29ff6b4  fix relevance notebook (mikeldking, Nov 14, 2023)
cc8e7e2  docs: update evals (mikeldking, Nov 14, 2023)
e564db0  Normalize prompt templates in llm_classify (anticorrelator, Nov 14, 2023)
6cdbecb  Ruff 🐶 (anticorrelator, Nov 14, 2023)
ad1ef59  Merge remote-tracking branch 'origin/dustin/evals-with-explanations' … (anticorrelator, Nov 14, 2023)
2b257d2  feat(evals): add an output_parser to llm_generate (#1736) (mikeldking, Nov 14, 2023)
00d9cb4  docs(evals): document llm_generate with output parser (#1741) (mikeldking, Nov 14, 2023)
8ac5201  Merge branch 'main' into dustin/evals-with-explanations (mikeldking, Nov 14, 2023)
34 changes: 18 additions & 16 deletions src/phoenix/experimental/evals/__init__.py
@@ -2,15 +2,16 @@
from .models import OpenAIModel, VertexAIModel
from .retrievals import compute_precisions_at_k
from .templates import (
CODE_READABILITY_PROMPT_RAILS_MAP,
CODE_READABILITY_PROMPT_TEMPLATE_STR,
HALLUCINATION_PROMPT_RAILS_MAP,
HALLUCINATION_PROMPT_TEMPLATE_STR,
CODE_READABILITY_PROMPT_RAILS,
CODE_READABILITY_PROMPT_TEMPLATE,
HALLUCINATION_PROMPT_RAILS,
HALLUCINATION_PROMPT_TEMPLATE,
NOT_PARSABLE,
RAG_RELEVANCY_PROMPT_RAILS_MAP,
RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
TOXICITY_PROMPT_RAILS_MAP,
TOXICITY_PROMPT_TEMPLATE_STR,
RAG_RELEVANCY_PROMPT_RAILS,
RAG_RELEVANCY_PROMPT_TEMPLATE,
TOXICITY_PROMPT_RAILS,
TOXICITY_PROMPT_TEMPLATE,
ClassificationTemplate,
PromptTemplate,
)
from .utils.downloads import download_benchmark_dataset
@@ -23,14 +24,15 @@
"OpenAIModel",
"VertexAIModel",
"PromptTemplate",
"CODE_READABILITY_PROMPT_RAILS_MAP",
"CODE_READABILITY_PROMPT_TEMPLATE_STR",
"HALLUCINATION_PROMPT_RAILS_MAP",
"HALLUCINATION_PROMPT_TEMPLATE_STR",
"RAG_RELEVANCY_PROMPT_RAILS_MAP",
"RAG_RELEVANCY_PROMPT_TEMPLATE_STR",
"TOXICITY_PROMPT_TEMPLATE_STR",
"TOXICITY_PROMPT_RAILS_MAP",
"ClassificationTemplate",
"CODE_READABILITY_PROMPT_RAILS",
"CODE_READABILITY_PROMPT_TEMPLATE",
"HALLUCINATION_PROMPT_RAILS",
"HALLUCINATION_PROMPT_TEMPLATE",
"RAG_RELEVANCY_PROMPT_RAILS",
"RAG_RELEVANCY_PROMPT_TEMPLATE",
"TOXICITY_PROMPT_RAILS",
"TOXICITY_PROMPT_TEMPLATE",
"NOT_PARSABLE",
"run_relevance_eval",
]
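For downstream users the change mostly shows up at import time. A minimal before/after sketch, assuming the package root keeps re-exporting these names (names reflect the 20-commit revision shown here):

```python
# Before this PR: string templates and rail maps were exported directly.
# from phoenix.experimental.evals import (
#     RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
#     RAG_RELEVANCY_PROMPT_RAILS_MAP,
# )

# After this PR: template objects (ClassificationTemplate) and their rails.
from phoenix.experimental.evals import (
    RAG_RELEVANCY_PROMPT_RAILS,
    RAG_RELEVANCY_PROMPT_TEMPLATE,
)

rails = list(RAG_RELEVANCY_PROMPT_RAILS)  # allowed output labels for the relevance eval
```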
40 changes: 23 additions & 17 deletions src/phoenix/experimental/evals/functions/classify.py
@@ -8,11 +8,12 @@
from phoenix.experimental.evals.models import BaseEvalModel, OpenAIModel, set_verbosity
from phoenix.experimental.evals.templates import (
NOT_PARSABLE,
RAG_RELEVANCY_PROMPT_RAILS_MAP,
RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
PromptTemplate,
RAG_RELEVANCY_PROMPT_RAILS,
RAG_RELEVANCY_PROMPT_TEMPLATE,
ClassificationTemplate,
PromptOptions,
map_template,
normalize_template,
normalize_classification_template,
)
from phoenix.trace.semantic_conventions import DOCUMENT_CONTENT, INPUT_VALUE, RETRIEVAL_DOCUMENTS
from phoenix.utilities.logging import printif
@@ -32,7 +33,7 @@
def llm_classify(
dataframe: pd.DataFrame,
model: BaseEvalModel,
template: Union[PromptTemplate, str],
template: Union[ClassificationTemplate, str],
rails: List[str],
system_instruction: Optional[str] = None,
verbose: bool = False,
@@ -85,31 +86,36 @@ def llm_classify(
and model.supports_function_calling
)

# TODO: support explanation without function calling
if provide_explanation and not use_openai_function_call:
raise ValueError(
"explanation is not currently available for models without OpenAI function calling"
)

model_kwargs: Dict[str, Any] = {}
if use_openai_function_call:
openai_function = _default_openai_function(rails, provide_explanation)
model_kwargs["functions"] = [openai_function]
model_kwargs["function_call"] = {"name": openai_function["name"]}

eval_template = normalize_template(template)
prompts = map_template(dataframe, eval_template)
eval_template = normalize_classification_template(rails=rails, template=template)

prompt_options = PromptOptions(provide_explanation=provide_explanation)
prompts = map_template(dataframe, eval_template, options=prompt_options)

labels: List[str] = []
explanations: List[Optional[str]] = []

if generation_info := model.verbose_generation_info():
printif(verbose, generation_info)

for prompt in tqdm(prompts):
with set_verbosity(model, verbose) as verbose_model:
response = verbose_model(prompt, instruction=system_instruction, **model_kwargs)
if not use_openai_function_call:
unrailed_label = response
explanation = None
if provide_explanation:
unrailed_label, explanation = eval_template.parse_label(response), response
printif(
verbose and unrailed_label == NOT_PARSABLE,
f"- Could not parse {repr(response)}",
)
Review comment (Contributor) on lines +118 to +121:
documentation: I would add a bit more color here so the user understands what part of the execution is failing, perhaps including the response.

else:
unrailed_label = response
explanation = None
else:
try:
function_arguments = json.loads(response, strict=False)
@@ -132,8 +138,8 @@ def llm_classify(
def run_relevance_eval(
dataframe: pd.DataFrame,
model: BaseEvalModel,
template: Union[PromptTemplate, str] = RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
rails: List[str] = list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values()),
template: Union[ClassificationTemplate, str] = RAG_RELEVANCY_PROMPT_TEMPLATE,
rails: List[str] = list(RAG_RELEVANCY_PROMPT_RAILS),
system_instruction: Optional[str] = None,
query_column_name: str = "query",
document_column_name: str = "reference",
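As a usage sketch of the new explanation path (an assumption-laden illustration, not taken from this diff: the column names, the `model_name` keyword, the package-root re-export of `llm_classify`, and the returned `label`/`explanation` columns are all guesses consistent with the code above):

```python
import pandas as pd

from phoenix.experimental.evals import (
    OpenAIModel,
    RAG_RELEVANCY_PROMPT_RAILS,
    RAG_RELEVANCY_PROMPT_TEMPLATE,
    llm_classify,
)

# Hypothetical data: one row per (query, reference document) pair to judge.
df = pd.DataFrame(
    {
        "query": ["What is Arize Phoenix?"],
        "reference": ["Phoenix is an open-source library for LLM observability."],
    }
)

# provide_explanation=True routes through OpenAI function calling; per the diff,
# a ValueError is raised for models without function-calling support.
result = llm_classify(
    dataframe=df,
    model=OpenAIModel(model_name="gpt-4"),  # model_name kwarg is an assumption
    template=RAG_RELEVANCY_PROMPT_TEMPLATE,
    rails=list(RAG_RELEVANCY_PROMPT_RAILS),
    provide_explanation=True,
    verbose=True,  # commit 093e59c: print prompt if verbose
)
print(result.head())
```

Since `run_relevance_eval` further down in this file wires the same defaults, the relevance case can also be run without passing a template or rails explicitly.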
10 changes: 7 additions & 3 deletions src/phoenix/experimental/evals/functions/generate.py
@@ -4,7 +4,11 @@
import pandas as pd

from phoenix.experimental.evals.models import BaseEvalModel, set_verbosity
from phoenix.experimental.evals.templates import PromptTemplate, map_template, normalize_template
from phoenix.experimental.evals.templates import (
PromptTemplate,
map_template,
normalize_prompt_template,
)

logger = logging.getLogger(__name__)

@@ -44,8 +48,8 @@ def llm_generate(

"""
with set_verbosity(model, verbose) as verbose_model:
template = normalize_template(template)
logger.info(f"Template: \n{template.text}\n")
template = normalize_prompt_template(template)
logger.info(f"Template: \n{template.prompt()}\n")
logger.info(f"Template variables: {template.variables}")
prompts = map_template(dataframe, template)

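A minimal sketch of `llm_generate` with a plain string template, which `normalize_prompt_template` upgrades to a `PromptTemplate`. The keyword names, the package-root re-export, and the `OpenAIModel` constructor argument are assumptions; the `output_parser` hook added later in this PR (#1736) is omitted because its signature isn't visible in this hunk.

```python
import pandas as pd

from phoenix.experimental.evals import OpenAIModel, llm_generate

df = pd.DataFrame({"topic": ["relevance evals", "toxicity evals"]})

# {topic} must correspond to a dataframe column; template.variables is derived
# from placeholders like this one.
template = "Write one sentence describing {topic}."

generated = llm_generate(
    dataframe=df,
    template=template,
    model=OpenAIModel(model_name="gpt-4"),  # model_name kwarg is an assumption
)
```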
48 changes: 30 additions & 18 deletions src/phoenix/experimental/evals/templates/__init__.py
@@ -1,26 +1,38 @@
from .default_templates import (
CODE_READABILITY_PROMPT_RAILS_MAP,
CODE_READABILITY_PROMPT_TEMPLATE_STR,
HALLUCINATION_PROMPT_RAILS_MAP,
HALLUCINATION_PROMPT_TEMPLATE_STR,
RAG_RELEVANCY_PROMPT_RAILS_MAP,
RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
TOXICITY_PROMPT_RAILS_MAP,
TOXICITY_PROMPT_TEMPLATE_STR,
CODE_READABILITY_PROMPT_RAILS,
Review comment (Contributor): we need to preserve the binary True/False of the labels in the case of binary classification, so we should not touch these mappings.

CODE_READABILITY_PROMPT_TEMPLATE,
HALLUCINATION_PROMPT_RAILS,
HALLUCINATION_PROMPT_TEMPLATE,
RAG_RELEVANCY_PROMPT_RAILS,
RAG_RELEVANCY_PROMPT_TEMPLATE,
TOXICITY_PROMPT_RAILS,
TOXICITY_PROMPT_TEMPLATE,
)
from .template import (
NOT_PARSABLE,
ClassificationTemplate,
PromptOptions,
PromptTemplate,
map_template,
normalize_classification_template,
normalize_prompt_template,
)
from .template import NOT_PARSABLE, PromptTemplate, map_template, normalize_template

__all__ = [
"UserTemplate",
"PromptOptions",
"PromptTemplate",
"normalize_template",
"ClassificationTemplate",
"normalize_classification_template",
"normalize_prompt_template",
"map_template",
"NOT_PARSABLE",
"RAG_RELEVANCY_PROMPT_RAILS_MAP",
"RAG_RELEVANCY_PROMPT_TEMPLATE_STR",
"HALLUCINATION_PROMPT_RAILS_MAP",
"HALLUCINATION_PROMPT_TEMPLATE_STR",
"CODE_READABILITY_PROMPT_RAILS_MAP",
"CODE_READABILITY_PROMPT_TEMPLATE_STR",
"TOXICITY_PROMPT_RAILS_MAP",
"TOXICITY_PROMPT_TEMPLATE_STR",
"CODE_READABILITY_PROMPT_RAILS",
"CODE_READABILITY_PROMPT_TEMPLATE",
"HALLUCINATION_PROMPT_RAILS",
"HALLUCINATION_PROMPT_TEMPLATE",
"RAG_RELEVANCY_PROMPT_RAILS",
"RAG_RELEVANCY_PROMPT_TEMPLATE",
"TOXICITY_PROMPT_RAILS",
"TOXICITY_PROMPT_TEMPLATE",
]
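Based only on the call sites visible in this PR (`normalize_classification_template(rails=..., template=...)`, `PromptOptions(provide_explanation=...)`, `map_template(df, template, options=...)`, `parse_label(...)`), here is a rough sketch of how the new template pieces compose; the example template, rails, and column name are invented for illustration:

```python
import pandas as pd

from phoenix.experimental.evals.templates import (
    PromptOptions,
    map_template,
    normalize_classification_template,
)

# A bare string is upgraded to a ClassificationTemplate tied to its rails.
template_str = "Is the following text positive or negative?\n\n{text}\n\nAnswer in one word."
eval_template = normalize_classification_template(
    rails=["positive", "negative"], template=template_str
)

# PromptOptions carries per-call rendering flags such as provide_explanation.
options = PromptOptions(provide_explanation=False)

# map_template renders one prompt per dataframe row.
df = pd.DataFrame({"text": ["I loved this.", "This was awful."]})
prompts = map_template(df, eval_template, options=options)

# parse_label extracts the label portion of a raw model response; per the
# classify.py hunk the result is still "unrailed" (snapping to rails happens later).
label = eval_template.parse_label("negative")
```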