diff --git a/docs/api/evals/README.md b/docs/api/evals/README.md
index ebe5ead8b8..e04f40db58 100644
--- a/docs/api/evals/README.md
+++ b/docs/api/evals/README.md
@@ -79,7 +79,7 @@ Note that once you initialize the `PromptTemplate` class, you don't need to worr
 ## phoenix.experimental.evals.llm\_eval\_binary
 
 ```python
-def llm_classify(
+def llm_eval_binary(
     dataframe: pd.DataFrame,
     model: BaseEvalModel,
     template: Union[PromptTemplate, str],
diff --git a/docs/concepts/llm-evals.md b/docs/concepts/llm-evals.md
index 35ab67b58e..60469dbeec 100644
--- a/docs/concepts/llm-evals.md
+++ b/docs/concepts/llm-evals.md
@@ -72,7 +72,7 @@ eval_test_data['query'] = query
 #Evals model
 model_to_use = evals.OpenAIModel(model_name="gpt-4")
 ##### RUN RAG Retrieval Performance EVALS #####
-eval_result = llm_classify(eval_test_data, evals.RAG_RELEVANCY_PROMPT_TEMPLATE_STR, model_to_use)
+eval_result = llm_eval_binary(eval_test_data, evals.RAG_RELEVANCY_PROMPT_TEMPLATE_STR, model_to_use)
 ```
 
 The results are designed for easy analysis is Scikit learn or our convience functions built on top of Scikit learn.
@@ -82,7 +82,7 @@ from phoenix.experimental.evals import (
     RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
     OpenAIModel,
     download_benchmark_dataset,
-    llm_classify,
+    llm_eval_binary,
 )
 from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay
 
@@ -91,7 +91,7 @@ df = download_benchmark_dataset(
     task="binary-relevance-classification",
     dataset_name="wiki_qa-train"
 )
-df["eval_relevance"] = llm_classify(df, evals.RAG_RELEVANCY_PROMPT_TEMPLATE_STR, model_to_use)
+df["eval_relevance"] = llm_eval_binary(df, evals.RAG_RELEVANCY_PROMPT_TEMPLATE_STR, model_to_use)
 #Golden dataset has True/False map to -> "irrelevant" / "relevant"
 #we can then scikit compare to output of template - same format
 y_true = df["relevant"].map({True: "relevant", False: "irrelevant"})
@@ -114,7 +114,7 @@ LLM Evals where the Eval output is a numeric score or rating needs more research
 LLM Evals included currently in the library make a specific binary decision "hallucination" or "factual" for example. These binary decisions generate traditional Precision/Recall/F1/MRR metrics that can be applied to the decisions giving a very intuitive understanding of performance and provide comparable metrics across models.
 
 ```python
-df["eval_relevance"] = llm_classify(df, evals.RAG_RELEVANCY_PROMPT_TEMPLATE_STR, model_to_use)
+df["eval_relevance"] = llm_eval_binary(df, evals.RAG_RELEVANCY_PROMPT_TEMPLATE_STR, model_to_use)
 #Golden dataset has True/False map to -> "irrelevant" / "relevant"
 #we can then scikit compare to output of template - same format
 y_true = df["relevant"].map({True: "relevant", False: "irrelevant"})
@@ -195,7 +195,7 @@ The above template shows an example creation of an easy to use string template.
 ```python
 model = OpenAIModel(model_name="gpt-4",temperature=0.6)
-positive_eval = llm_classify_jason(
+positive_eval = llm_eval_binary_jason(
     dataframe=df,
     template= MY_CUSTOM_TEMPLATE,
     model=model
diff --git a/docs/concepts/llm-evals/code-generation-eval.md b/docs/concepts/llm-evals/code-generation-eval.md
index 5681ee7745..db90313294 100644
--- a/docs/concepts/llm-evals/code-generation-eval.md
+++ b/docs/concepts/llm-evals/code-generation-eval.md
@@ -13,7 +13,7 @@ from phoenix.experimental.evals import (
     CODE_READABILITY_PROMPT_TEMPLATE_STR,
     OpenAIModel,
     download_benchmark_dataset,
-    llm_classify,
+    llm_eval_binary,
 )
 
 model = OpenAIModel(
@@ -25,7 +25,7 @@ model = OpenAIModel(
 #It will remove text such as ",,," or "..."
 #Will ensure the binary value expected from the template is returned
 rails = list(CODE_READABILITY_PROMPT_RAILS_MAP.values())
-readability_classifications = llm_classify(
+readability_classifications = llm_eval_binary(
     dataframe=df,
     template=CODE_READABILITY_PROMPT_TEMPLATE_STR,
     model=model,
diff --git a/docs/concepts/llm-evals/hallucinations.md b/docs/concepts/llm-evals/hallucinations.md
index 1c7d76ba05..b6e162d5e2 100644
--- a/docs/concepts/llm-evals/hallucinations.md
+++ b/docs/concepts/llm-evals/hallucinations.md
@@ -10,7 +10,7 @@ from phoenix.experimental.evals import (
     HALLUCINATION_PROMPT_TEMPLATE_STR,
     OpenAIModel,
     download_benchmark_dataset,
-    llm_classify,
+    llm_eval_binary,
 )
 
 model = OpenAIModel(
@@ -22,7 +22,7 @@ model = OpenAIModel(
 #It will remove text such as ",,," or "..."
 #Will ensure the binary value expected from the template is returned
 rails = list(HALLUCINATION_PROMPT_RAILS_MAP.values())
-hallucination_classifications = llm_classify(
+hallucination_classifications = llm_eval_binary(
     dataframe=df, template=HALLUCINATION_PROMPT_TEMPLATE_STR, model=model, rails=rails
 )
diff --git a/docs/concepts/llm-evals/q-and-a-on-retrieved-data.md b/docs/concepts/llm-evals/q-and-a-on-retrieved-data.md
index 8a864d9112..1f9d9f9cc6 100644
--- a/docs/concepts/llm-evals/q-and-a-on-retrieved-data.md
+++ b/docs/concepts/llm-evals/q-and-a-on-retrieved-data.md
@@ -14,7 +14,7 @@ import phoenix.experimental.evals.templates.default_templates as templates
 from phoenix.experimental.evals import (
     OpenAIModel,
     download_benchmark_dataset,
-    llm_classify,
+    llm_eval_binary,
 )
 
 model = OpenAIModel(
@@ -26,7 +26,7 @@ model = OpenAIModel(
 #It will remove text such as ",,," or "...", anything not the
 #binary value expected from the template
 rails = list(templates.QA_PROMPT_RAILS_MAP.values())
-Q_and_A_classifications = llm_classify(
+Q_and_A_classifications = llm_eval_binary(
     dataframe=df_sample,
     template=templates.QA_PROMPT_TEMPLATE_STR,
     model=model,
diff --git a/docs/concepts/llm-evals/retrieval-rag-relevance.md b/docs/concepts/llm-evals/retrieval-rag-relevance.md
index 5591198697..3b2c2b060c 100644
--- a/docs/concepts/llm-evals/retrieval-rag-relevance.md
+++ b/docs/concepts/llm-evals/retrieval-rag-relevance.md
@@ -10,7 +10,7 @@ from phoenix.experimental.evals import (
     RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
     OpenAIModel,
     download_benchmark_dataset,
-    llm_classify,
+    llm_eval_binary,
 )
 
 model = OpenAIModel(
@@ -22,7 +22,7 @@ model = OpenAIModel(
 #It will remove text such as ",,," or "..."
 #Will ensure the binary value expected from the template is returned
 rails = list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values())
-relevance_classifications = llm_classify(
+relevance_classifications = llm_eval_binary(
     dataframe=df,
     template=RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
     model=model,
diff --git a/docs/concepts/llm-evals/summarization-eval.md b/docs/concepts/llm-evals/summarization-eval.md
index eec1f497dd..9fa3d62ca9 100644
--- a/docs/concepts/llm-evals/summarization-eval.md
+++ b/docs/concepts/llm-evals/summarization-eval.md
@@ -14,7 +14,7 @@ import phoenix.experimental.evals.templates.default_templates as templates
 from phoenix.experimental.evals import (
     OpenAIModel,
     download_benchmark_dataset,
-    llm_classify,
+    llm_eval_binary,
 )
 
 model = OpenAIModel(
@@ -26,7 +26,7 @@ model = OpenAIModel(
 #It will remove text such as ",,," or "..."
 #Will ensure the binary value expected from the template is returned
 rails = list(templates.SUMMARIZATION_PROMPT_RAILS_MAP.values())
-summarization_classifications = llm_classify(
+summarization_classifications = llm_eval_binary(
     dataframe=df_sample,
     template=templates.SUMMARIZATION_PROMPT_TEMPLATE_STR,
     model=model,
diff --git a/docs/concepts/llm-evals/toxicity.md b/docs/concepts/llm-evals/toxicity.md
index f71c84ac35..177e096b55 100644
--- a/docs/concepts/llm-evals/toxicity.md
+++ b/docs/concepts/llm-evals/toxicity.md
@@ -14,7 +14,7 @@ from phoenix.experimental.evals import (
     TOXICITY_PROMPT_TEMPLATE_STR,
     OpenAIModel,
     download_benchmark_dataset,
-    llm_classify,
+    llm_eval_binary,
 )
 
 model = OpenAIModel(
@@ -26,7 +26,7 @@ model = OpenAIModel(
 #It will remove text such as ",,," or "..."
 #Will ensure the binary value expected from the template is returned
 rails = list(TOXICITY_PROMPT_RAILS_MAP.values())
-toxic_classifications = llm_classify(
+toxic_classifications = llm_eval_binary(
     dataframe=df_sample,
     template=TOXICITY_PROMPT_TEMPLATE_STR,
     model=model,
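
Taken together, the renamed call sites above follow one pattern: build an eval model, pull the output rails for a template, run `llm_eval_binary` over a dataframe, and score the resulting labels against a golden column with scikit-learn. The sketch below strings those pieces together end to end. It is a minimal illustration assembled from the snippets in this diff, not a confirmed part of the library's docs; the toy dataframe contents and its `query`/`reference`/`relevant` column names are assumptions for demonstration only.

```python
# Minimal end-to-end sketch assembled from the snippets in this diff.
# The toy dataframe below is illustrative only; its column names ("query",
# "reference", "relevant") are assumed to match what the template expects.
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

from phoenix.experimental.evals import (
    RAG_RELEVANCY_PROMPT_RAILS_MAP,
    RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
    OpenAIModel,
    llm_eval_binary,
)

# Eval model, configured as in the docs above
model = OpenAIModel(model_name="gpt-4", temperature=0.0)

# Rails constrain the eval output to the binary labels the template expects
rails = list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values())

# Hypothetical golden dataset: "relevant" holds the human label
df = pd.DataFrame(
    {
        "query": ["How do I install Phoenix?", "How do I install Phoenix?"],
        "reference": ["Run pip install arize-phoenix.", "Phoenix ships eval templates."],
        "relevant": [True, False],
    }
)

# Run the eval and keep the binary labels alongside the golden column
df["eval_relevance"] = llm_eval_binary(
    dataframe=df,
    template=RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
    model=model,
    rails=rails,
)

# Map the golden booleans into the same label space and score with scikit-learn
y_true = df["relevant"].map({True: "relevant", False: "irrelevant"})
y_pred = df["eval_relevance"]
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, pos_label="relevant", average="binary"
)
print(f"precision={precision:.2f} recall={recall:.2f} f1={f1:.2f}")
```

The per-template pages touched by this diff (hallucination, code readability, Q&A, summarization, toxicity) differ only in which template string, rails map, and dataframe columns they plug into this same pattern.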