feat: Evals with explanations #1699

Merged
merged 33 commits into main from dustin/evals-with-explanations on Nov 14, 2023
Changes from 11 commits
Commits
33 commits
7956822
Add explanation template
anticorrelator Nov 1, 2023
763c481
Spike out explanations
anticorrelator Nov 2, 2023
38d2be9
Ruff 🐶
anticorrelator Nov 2, 2023
c314c74
Merge remote-tracking branch 'origin' into dustin/evals-with-explanat…
anticorrelator Nov 6, 2023
a66d6eb
Use tailored explanation prompt
anticorrelator Nov 6, 2023
85092ca
Add explanation templates for all evals
anticorrelator Nov 7, 2023
2b0f29b
Wire up prompt template objects
anticorrelator Nov 7, 2023
f0aa75f
Update models to use new template object
anticorrelator Nov 8, 2023
6c6140a
Ruff 🐶
anticorrelator Nov 8, 2023
a11a4ba
Resolve type and linter issues
anticorrelator Nov 8, 2023
9c5af4e
Fix more typing issues
anticorrelator Nov 8, 2023
a551d60
Address first round of feedback
anticorrelator Nov 9, 2023
67e9e13
Extract `ClassificationTemplate` ABC
anticorrelator Nov 10, 2023
75a027c
Label extraction belongs to the "template" object
anticorrelator Nov 10, 2023
6fc6fc6
Add logging for unparseable labels
anticorrelator Nov 10, 2023
eb11ebb
Merge remote-tracking branch 'origin' into dustin/evals-with-explanat…
anticorrelator Nov 13, 2023
59d9ded
Merge remote-tracking branch 'origin' into dustin/evals-with-explanat…
anticorrelator Nov 13, 2023
a2509c9
Patch in openai key environment variable for tests
anticorrelator Nov 13, 2023
eaff46d
Refactor to address feedback
anticorrelator Nov 13, 2023
b8e13e3
Evaluators should use PromptTemplates
anticorrelator Nov 13, 2023
d0f1d8b
Pair with Mikyo
anticorrelator Nov 14, 2023
888f223
Fix for CI
anticorrelator Nov 14, 2023
cebda8c
`PROMPT_TEMPLATE_STR` -> `PROMPT_TEMPLATE`
anticorrelator Nov 14, 2023
093e59c
Print prompt if verbose
anticorrelator Nov 14, 2023
17025ef
Add __repr__ to `PromptTemplate`
anticorrelator Nov 14, 2023
29ff6b4
fix relevance notebook
mikeldking Nov 14, 2023
cc8e7e2
docs: update evals
mikeldking Nov 14, 2023
e564db0
Normalize prompt templates in llm_classify
anticorrelator Nov 14, 2023
6cdbecb
Ruff 🐶
anticorrelator Nov 14, 2023
ad1ef59
Merge remote-tracking branch 'origin/dustin/evals-with-explanations' …
anticorrelator Nov 14, 2023
2b257d2
feat(evals): add an output_parser to llm_generate (#1736)
mikeldking Nov 14, 2023
00d9cb4
docs(evals): document llm_generate with output parser (#1741)
mikeldking Nov 14, 2023
8ac5201
Merge branch 'main' into dustin/evals-with-explanations
mikeldking Nov 14, 2023
34 changes: 18 additions & 16 deletions src/phoenix/experimental/evals/__init__.py
@@ -2,15 +2,16 @@
from .models import OpenAIModel, VertexAIModel
from .retrievals import compute_precisions_at_k
from .templates import (
CODE_READABILITY_PROMPT_RAILS_MAP,
CODE_READABILITY_PROMPT_TEMPLATE_STR,
HALLUCINATION_PROMPT_RAILS_MAP,
HALLUCINATION_PROMPT_TEMPLATE_STR,
CODE_READABILITY_PROMPT_RAILS,
CODE_READABILITY_PROMPT_TEMPLATE,
HALLUCINATION_PROMPT_RAILS,
HALLUCINATION_PROMPT_TEMPLATE,
NOT_PARSABLE,
RAG_RELEVANCY_PROMPT_RAILS_MAP,
RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
TOXICITY_PROMPT_RAILS_MAP,
TOXICITY_PROMPT_TEMPLATE_STR,
RAG_RELEVANCY_PROMPT_RAILS,
RAG_RELEVANCY_PROMPT_TEMPLATE,
TOXICITY_PROMPT_RAILS,
TOXICITY_PROMPT_TEMPLATE,
ClassificationTemplate,
PromptTemplate,
)
from .utils.downloads import download_benchmark_dataset
@@ -24,14 +25,15 @@
"OpenAIModel",
"VertexAIModel",
"PromptTemplate",
"CODE_READABILITY_PROMPT_RAILS_MAP",
"CODE_READABILITY_PROMPT_TEMPLATE_STR",
"HALLUCINATION_PROMPT_RAILS_MAP",
"HALLUCINATION_PROMPT_TEMPLATE_STR",
"RAG_RELEVANCY_PROMPT_RAILS_MAP",
"RAG_RELEVANCY_PROMPT_TEMPLATE_STR",
"TOXICITY_PROMPT_TEMPLATE_STR",
"TOXICITY_PROMPT_RAILS_MAP",
"ClassificationTemplate",
"CODE_READABILITY_PROMPT_RAILS",
"CODE_READABILITY_PROMPT_TEMPLATE",
"HALLUCINATION_PROMPT_RAILS",
"HALLUCINATION_PROMPT_TEMPLATE",
"RAG_RELEVANCY_PROMPT_RAILS",
"RAG_RELEVANCY_PROMPT_TEMPLATE",
"TOXICITY_PROMPT_RAILS",
"TOXICITY_PROMPT_TEMPLATE",
"NOT_PARSABLE",
"run_relevance_eval",
]
31 changes: 18 additions & 13 deletions src/phoenix/experimental/evals/functions/classify.py
@@ -1,5 +1,6 @@
import json
import logging
import re
import warnings
from typing import Any, Dict, Iterable, List, Optional, Union, cast

@@ -8,8 +9,8 @@
from phoenix.experimental.evals.models import BaseEvalModel, OpenAIModel, set_verbosity
from phoenix.experimental.evals.templates import (
NOT_PARSABLE,
RAG_RELEVANCY_PROMPT_RAILS_MAP,
RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
RAG_RELEVANCY_PROMPT_RAILS,
RAG_RELEVANCY_PROMPT_TEMPLATE,
PromptTemplate,
map_template,
normalize_template,
@@ -85,20 +86,16 @@ def llm_classify(
and model.supports_function_calling
)

# TODO: support explanation without function calling
if provide_explanation and not use_openai_function_call:
raise ValueError(
"explanation is not currently available for models without OpenAI function calling"
)

model_kwargs: Dict[str, Any] = {}
if use_openai_function_call:
openai_function = _default_openai_function(rails, provide_explanation)
model_kwargs["functions"] = [openai_function]
model_kwargs["function_call"] = {"name": openai_function["name"]}

eval_template = normalize_template(template)
prompts = map_template(dataframe, eval_template)

prompts = map_template(dataframe, eval_template, provide_explanation=provide_explanation)

with set_verbosity(model, verbose) as verbose_model:
responses = verbose_model.generate(
prompts.to_list(), instruction=system_instruction, **model_kwargs
@@ -112,8 +109,8 @@
if not use_openai_function_call:
raw_string = response
if provide_explanation:
# TODO: support explanation without function calling
explanations.append(None)
raw_string, explanation = _search_for_label(raw_string), raw_string
explanations.append(explanation)
else:
try:
function_arguments = json.loads(response, strict=False)
@@ -193,8 +190,8 @@ def llm_eval_binary(
def run_relevance_eval(
dataframe: pd.DataFrame,
model: BaseEvalModel,
template: Union[PromptTemplate, str] = RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
rails: List[str] = list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values()),
template: Union[PromptTemplate, str] = RAG_RELEVANCY_PROMPT_TEMPLATE,
rails: List[str] = list(RAG_RELEVANCY_PROMPT_RAILS.values()),
system_instruction: Optional[str] = None,
query_column_name: str = "query",
document_column_name: str = "reference",
@@ -343,6 +340,14 @@ def _snap_to_rail(raw_string: Optional[str], rails: List[str], verbose: bool = F
return rail


def _search_for_label(raw_string: str) -> str:
label_delimiter = r"\W*label\W*"
parts = re.split(label_delimiter, raw_string, maxsplit=1, flags=re.IGNORECASE)
if len(parts) == 2:
return parts[1]
return ""
Contributor
thought: we could make these types of "parsers" injectable into llm_classify - because this is inherently tied to what prompt you use - if you use a prompt that doesn't ask for this kind of structure, you are going to want to change this.

e.g.

labels_df = llm_classify(source_df, ...., label_parser: lambda (raw_str) -> str)
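
A rough sketch of what that call could look like; the label_parser keyword argument, source_df, my_template, and answer_parser are all hypothetical and not part of this PR:

```python
# Hypothetical caller-supplied parser for a prompt that ends with "ANSWER: <label>".
def answer_parser(raw_response: str) -> str:
    return raw_response.rsplit("ANSWER:", 1)[-1].strip()

# Hypothetical keyword argument sketching the injectable-parser idea;
# llm_classify does not accept label_parser in this PR.
labels_df = llm_classify(
    dataframe=source_df,
    model=model,
    template=my_template,
    rails=rails,
    label_parser=answer_parser,
)
```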

Contributor
Coming back to this after reading through everything - it's really a parser for the with_explanation template so maybe it should live with that template. We can default to this one but give the affordance for it to be overridden if needed.
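
A minimal sketch of that alternative, with hypothetical names (the actual ClassificationTemplate API in this PR may differ): the explanation template carries the default "split on LABEL" parser and can be overridden per template.

```python
import re
from typing import List, Optional


class ExplanationTemplate:
    """Hypothetical template that owns its label parser; names are illustrative."""

    def __init__(self, template: str, rails: List[str]) -> None:
        self.template = template
        self.rails = rails

    def parse_label(self, raw_string: str) -> Optional[str]:
        # Default mirrors _search_for_label; a template built around a differently
        # structured prompt could override this method.
        parts = re.split(r"\W*label\W*", raw_string, maxsplit=1, flags=re.IGNORECASE)
        return parts[1] if len(parts) == 2 else None
```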

Contributor
It's also probably worth threading through the verbose logging here so the end-user knows when their LLM is producing un-parsable code.
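
A sketch of that suggestion, assuming a module-level logger (classify.py already imports logging in this diff) and placement inside the response loop; the exact wiring to the verbose flag is left out:

```python
import logging

logger = logging.getLogger(__name__)

# Hypothetical: warn whenever the label delimiter cannot be found in the raw response.
label = _search_for_label(raw_string)
if not label:
    logger.warning("Could not parse a label from model output: %r", raw_string)
```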

anticorrelator marked this conversation as resolved.


def _default_openai_function(
rails: List[str],
with_explanation: bool = False,
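Taken together, the classify.py changes let a caller request explanations alongside labels. A rough usage sketch follows; the output columns ("label", "explanation"), the "query"/"reference" column names, and the OpenAIModel arguments are assumptions based on this diff rather than the verified final API:

```python
import pandas as pd

from phoenix.experimental.evals import (
    OpenAIModel,
    RAG_RELEVANCY_PROMPT_RAILS,
    RAG_RELEVANCY_PROMPT_TEMPLATE,
    llm_classify,
)

# Input column names must match the template variables.
df = pd.DataFrame(
    {
        "query": ["What is Arize Phoenix?"],
        "reference": ["Phoenix is an open-source observability library for LLM applications."],
    }
)

model = OpenAIModel(model_name="gpt-4")  # constructor arguments assumed
evals_df = llm_classify(
    dataframe=df,
    model=model,
    template=RAG_RELEVANCY_PROMPT_TEMPLATE,
    rails=list(RAG_RELEVANCY_PROMPT_RAILS.values()),
    provide_explanation=True,  # new in this PR
)
print(evals_df)
```
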
41 changes: 24 additions & 17 deletions src/phoenix/experimental/evals/templates/__init__.py
@@ -1,26 +1,33 @@
from .default_templates import (
CODE_READABILITY_PROMPT_RAILS_MAP,
CODE_READABILITY_PROMPT_TEMPLATE_STR,
HALLUCINATION_PROMPT_RAILS_MAP,
HALLUCINATION_PROMPT_TEMPLATE_STR,
RAG_RELEVANCY_PROMPT_RAILS_MAP,
RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
TOXICITY_PROMPT_RAILS_MAP,
TOXICITY_PROMPT_TEMPLATE_STR,
CODE_READABILITY_PROMPT_RAILS,
Contributor
we need to preserve the binary True/False of the labels in the case of binary classification so we should not touch these mappings.

CODE_READABILITY_PROMPT_TEMPLATE,
HALLUCINATION_PROMPT_RAILS,
HALLUCINATION_PROMPT_TEMPLATE,
RAG_RELEVANCY_PROMPT_RAILS,
RAG_RELEVANCY_PROMPT_TEMPLATE,
TOXICITY_PROMPT_RAILS,
TOXICITY_PROMPT_TEMPLATE,
)
from .template import (
NOT_PARSABLE,
ClassificationTemplate,
PromptTemplate,
map_template,
normalize_template,
)
from .template import NOT_PARSABLE, PromptTemplate, map_template, normalize_template

__all__ = [
"PromptTemplate",
"ClassificationTemplate",
"normalize_template",
"map_template",
"NOT_PARSABLE",
"RAG_RELEVANCY_PROMPT_RAILS_MAP",
"RAG_RELEVANCY_PROMPT_TEMPLATE_STR",
"HALLUCINATION_PROMPT_RAILS_MAP",
"HALLUCINATION_PROMPT_TEMPLATE_STR",
"CODE_READABILITY_PROMPT_RAILS_MAP",
"CODE_READABILITY_PROMPT_TEMPLATE_STR",
"TOXICITY_PROMPT_RAILS_MAP",
"TOXICITY_PROMPT_TEMPLATE_STR",
"CODE_READABILITY_PROMPT_RAILS",
"CODE_READABILITY_PROMPT_TEMPLATE",
"HALLUCINATION_PROMPT_RAILS",
"HALLUCINATION_PROMPT_TEMPLATE",
"RAG_RELEVANCY_PROMPT_RAILS",
"RAG_RELEVANCY_PROMPT_TEMPLATE",
"TOXICITY_PROMPT_RAILS",
"TOXICITY_PROMPT_TEMPLATE",
]