Commit dd9fc2e
restore docs
axiomofjoy committed Oct 16, 2023
1 parent 67ad155 commit dd9fc2e
Showing 8 changed files with 18 additions and 18 deletions.
docs/api/evals/README.md (1 addition, 1 deletion)
@@ -79,7 +79,7 @@ Note that once you initialize the `PromptTemplate` class, you don't need to worr
## phoenix.experimental.evals.llm\_eval\_binary

```python
-def llm_classify(
+def llm_eval_binary(
dataframe: pd.DataFrame,
model: BaseEvalModel,
template: Union[PromptTemplate, str],
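For reference, a minimal usage sketch of the restored `llm_eval_binary` name, pieced together from the snippets elsewhere in this commit; the example dataframe contents and the `query`/`reference` column names are illustrative assumptions, not part of the diff, and the import list simply mirrors the other files changed here:

```python
import pandas as pd

from phoenix.experimental.evals import (
    OpenAIModel,
    RAG_RELEVANCY_PROMPT_RAILS_MAP,
    RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
    llm_eval_binary,
)

# Hypothetical rows; the template fills its variables from matching columns.
df = pd.DataFrame(
    {
        "query": ["How do I run LLM Evals in Phoenix?"],
        "reference": ["The phoenix.experimental.evals module ships eval templates and models."],
    }
)

model = OpenAIModel(model_name="gpt-4", temperature=0.0)

# Rails hold the output to the binary labels defined by the template.
rails = list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values())

relevance_labels = llm_eval_binary(
    dataframe=df,
    template=RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
    model=model,
    rails=rails,
)
```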
docs/concepts/llm-evals.md (5 additions, 5 deletions)
@@ -72,7 +72,7 @@ eval_test_data['query'] = query
#Evals model
model_to_use = evals.OpenAIModel(model_name="gpt-4")
##### RUN RAG Retrieval Performance EVALS #####
-eval_result = llm_classify(eval_test_data, evals.RAG_RELEVANCY_PROMPT_TEMPLATE_STR, model_to_use)
+eval_result = llm_eval_binary(eval_test_data, evals.RAG_RELEVANCY_PROMPT_TEMPLATE_STR, model_to_use)
```

The results are designed for easy analysis in scikit-learn or our convenience functions built on top of scikit-learn.
@@ -82,7 +82,7 @@ from phoenix.experimental.evals import (
RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
OpenAIModel,
download_benchmark_dataset,
-llm_classify,
+llm_eval_binary,
)
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay

@@ -91,7 +91,7 @@ df = download_benchmark_dataset(
task="binary-relevance-classification", dataset_name="wiki_qa-train"
)

df["eval_relevance"] = llm_classify(df, evals.RAG_RELEVANCY_PROMPT_TEMPLATE_STR, model_to_use)
df["eval_relevance"] = llm_eval_binary(df, evals.RAG_RELEVANCY_PROMPT_TEMPLATE_STR, model_to_use)
#Golden dataset has True/False map to -> "irrelevant" / "relevant"
#we can then scikit compare to output of template - same format
y_true = df["relevant"].map({True: "relevant", False: "irrelevant"})
@@ -114,7 +114,7 @@ LLM Evals where the Eval output is a numeric score or rating needs more research
LLM Evals currently included in the library make a specific binary decision, for example "hallucination" or "factual". These binary decisions generate traditional Precision/Recall/F1/MRR metrics, giving a very intuitive understanding of performance and providing comparable metrics across models.

```python
df["eval_relevance"] = llm_classify(df, evals.RAG_RELEVANCY_PROMPT_TEMPLATE_STR, model_to_use)
df["eval_relevance"] = llm_eval_binary(df, evals.RAG_RELEVANCY_PROMPT_TEMPLATE_STR, model_to_use)
#Golden dataset has True/False map to -> "irrelevant" / "relevant"
#we can then scikit compare to output of template - same format
y_true = df["relevant"].map({True: "relevant", False: "irrelevant"})
@@ -195,7 +195,7 @@ The above template shows an example of creating an easy-to-use string template.
```python

model = OpenAIModel(model_name="gpt-4",temperature=0.6)
-positive_eval = llm_classify_jason(
+positive_eval = llm_eval_binary_jason(
dataframe=df,
template= MY_CUSTOM_TEMPLATE,
model=model
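For context, a hedged sketch of what a custom template string such as `MY_CUSTOM_TEMPLATE` might look like; the wording and the `{query}`/`{reference}` variable names are illustrative assumptions rather than the library's actual template:

```python
# A hypothetical custom eval template. Variables in curly braces are
# expected to match column names in the dataframe passed to the eval.
MY_CUSTOM_TEMPLATE = """
You are judging whether the reference text answers the question.

[Question]: {query}
[Reference]: {reference}

Respond with a single word: "correct" or "incorrect".
"""
```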
docs/concepts/llm-evals/code-generation-eval.md (2 additions, 2 deletions)
@@ -13,7 +13,7 @@ from phoenix.experimental.evals import (
CODE_READABILITY_PROMPT_TEMPLATE_STR,
OpenAIModel,
download_benchmark_dataset,
-llm_classify,
+llm_eval_binary,
)

model = OpenAIModel(
@@ -25,7 +25,7 @@ model = OpenAIModel(
#It will remove text such as ",,," or "..."
#Will ensure the binary value expected from the template is returned
rails = list(CODE_READABILITY_PROMPT_RAILS_MAP.values())
-readability_classifications = llm_classify(
+readability_classifications = llm_eval_binary(
dataframe=df,
template=CODE_READABILITY_PROMPT_TEMPLATE_STR,
model=model,
docs/concepts/llm-evals/hallucinations.md (2 additions, 2 deletions)
@@ -10,7 +10,7 @@ from phoenix.experimental.evals import (
HALLUCINATION_PROMPT_TEMPLATE_STR,
OpenAIModel,
download_benchmark_dataset,
-llm_classify,
+llm_eval_binary,
)

model = OpenAIModel(
@@ -22,7 +22,7 @@ model = OpenAIModel(
#It will remove text such as ",,," or "..."
#Will ensure the binary value expected from the template is returned
rails = list(HALLUCINATION_PROMPT_RAILS_MAP.values())
-hallucination_classifications = llm_classify(
+hallucination_classifications = llm_eval_binary(
dataframe=df, template=HALLUCINATION_PROMPT_TEMPLATE_STR, model=model, rails=rails
)

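Not part of this commit, but a minimal sketch of how the returned labels might be inspected afterwards, assuming `llm_eval_binary` returns one label per input row as in the earlier snippets; the column name is hypothetical:

```python
# Attach the labels to the evaluated dataframe and check their distribution.
df["hallucination_label"] = hallucination_classifications
print(df["hallucination_label"].value_counts())
```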
docs/concepts/llm-evals/q-and-a-on-retrieved-data.md (2 additions, 2 deletions)
@@ -14,7 +14,7 @@ import phoenix.experimental.evals.templates.default_templates as templates
from phoenix.experimental.evals import (
OpenAIModel,
download_benchmark_dataset,
-llm_classify,
+llm_eval_binary,
)

model = OpenAIModel(
@@ -26,7 +26,7 @@ model = OpenAIModel(
#It will remove text such as ",,," or "...", anything not the
#binary value expected from the template
rails = list(templates.QA_PROMPT_RAILS_MAP.values())
-Q_and_A_classifications = llm_classify(
+Q_and_A_classifications = llm_eval_binary(
dataframe=df_sample,
template=templates.QA_PROMPT_TEMPLATE_STR,
model=model,
docs/concepts/llm-evals/retrieval-rag-relevance.md (2 additions, 2 deletions)
@@ -10,7 +10,7 @@ from phoenix.experimental.evals import (
RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
OpenAIModel,
download_benchmark_dataset,
-llm_classify,
+llm_eval_binary,
)

model = OpenAIModel(
@@ -22,7 +22,7 @@ model = OpenAIModel(
#It will remove text such as ",,," or "..."
#Will ensure the binary value expected from the template is returned
rails = list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values())
-relevance_classifications = llm_classify(
+relevance_classifications = llm_eval_binary(
dataframe=df,
template=RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
model=model,
docs/concepts/llm-evals/summarization-eval.md (2 additions, 2 deletions)
@@ -14,7 +14,7 @@ import phoenix.experimental.evals.templates.default_templates as templates
from phoenix.experimental.evals import (
OpenAIModel,
download_benchmark_dataset,
-llm_classify,
+llm_eval_binary,
)

model = OpenAIModel(
@@ -26,7 +26,7 @@ model = OpenAIModel(
#It will remove text such as ",,," or "..."
#Will ensure the binary value expected from the template is returned
rails = list(templates.SUMMARIZATION_PROMPT_RAILS_MAP.values())
-summarization_classifications = llm_classify(
+summarization_classifications = llm_eval_binary(
dataframe=df_sample,
template=templates.SUMMARIZATION_PROMPT_TEMPLATE_STR,
model=model,
docs/concepts/llm-evals/toxicity.md (2 additions, 2 deletions)
@@ -14,7 +14,7 @@ from phoenix.experimental.evals import (
TOXICITY_PROMPT_TEMPLATE_STR,
OpenAIModel,
download_benchmark_dataset,
-llm_classify,
+llm_eval_binary,
)

model = OpenAIModel(
@@ -26,7 +26,7 @@ model = OpenAIModel(
#It will remove text such as ",,," or "..."
#Will ensure the binary value expected from the template is returned
rails = list(TOXICITY_PROMPT_RAILS_MAP.values())
-toxic_classifications = llm_classify(
+toxic_classifications = llm_eval_binary(
dataframe=df_sample,
template=TOXICITY_PROMPT_TEMPLATE_STR,
model=model,
