diff --git a/docs/api/evals/README.md b/docs/api/evals/README.md
index ebe5ead8b8..e04f40db58 100644
--- a/docs/api/evals/README.md
+++ b/docs/api/evals/README.md
@@ -79,7 +79,7 @@ Note that once you initialize the `PromptTemplate` class, you don't need to worr
 ## phoenix.experimental.evals.llm\_eval\_binary
 
 ```python
-def llm_classify(
+def llm_eval_binary(
     dataframe: pd.DataFrame,
     model: BaseEvalModel,
     template: Union[PromptTemplate, str],
diff --git a/docs/concepts/llm-evals.md b/docs/concepts/llm-evals.md
index 35ab67b58e..60469dbeec 100644
--- a/docs/concepts/llm-evals.md
+++ b/docs/concepts/llm-evals.md
@@ -72,7 +72,7 @@ eval_test_data['query'] = query
 #Evals model
 model_to_use = evals.OpenAIModel(model_name="gpt-4")
 ##### RUN RAG Retrieval Performance EVALS #####
-eval_result = llm_classify(eval_test_data, evals.RAG_RELEVANCY_PROMPT_TEMPLATE_STR, model_to_use)
+eval_result = llm_eval_binary(eval_test_data, evals.RAG_RELEVANCY_PROMPT_TEMPLATE_STR, model_to_use)
 ```
 
 The results are designed for easy analysis is Scikit learn or our convience functions built on top of Scikit learn.
@@ -82,7 +82,7 @@ from phoenix.experimental.evals import (
     RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
     OpenAIModel,
     download_benchmark_dataset,
-    llm_classify,
+    llm_eval_binary,
 )
 from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay
 
@@ -91,7 +91,7 @@ df = download_benchmark_dataset(
     task="binary-relevance-classification",
     dataset_name="wiki_qa-train"
 )
-df["eval_relevance"] = llm_classify(df, evals.RAG_RELEVANCY_PROMPT_TEMPLATE_STR, model_to_use)
+df["eval_relevance"] = llm_eval_binary(df, evals.RAG_RELEVANCY_PROMPT_TEMPLATE_STR, model_to_use)
 #Golden dataset has True/False map to -> "irrelevant" / "relevant"
 #we can then scikit compare to output of template - same format
 y_true = df["relevant"].map({True: "relevant", False: "irrelevant"})
@@ -114,7 +114,7 @@ LLM Evals where the Eval output is a numeric score or rating needs more research
 LLM Evals included currently in the library make a specific binary decision "hallucination" or "factual" for example. These binary decisions generate traditional Precision/Recall/F1/MRR metrics that can be applied to the decisions giving a very intuitive understanding of performance and provide comparable metrics across models.
 
 ```python
-df["eval_relevance"] = llm_classify(df, evals.RAG_RELEVANCY_PROMPT_TEMPLATE_STR, model_to_use)
+df["eval_relevance"] = llm_eval_binary(df, evals.RAG_RELEVANCY_PROMPT_TEMPLATE_STR, model_to_use)
 #Golden dataset has True/False map to -> "irrelevant" / "relevant"
 #we can then scikit compare to output of template - same format
 y_true = df["relevant"].map({True: "relevant", False: "irrelevant"})
@@ -195,7 +195,7 @@ The above template shows an example creation of an easy to use string template.
 ```python
 model = OpenAIModel(model_name="gpt-4",temperature=0.6)
-positive_eval = llm_classify_jason(
+positive_eval = llm_eval_binary_jason(
     dataframe=df,
     template= MY_CUSTOM_TEMPLATE,
     model=model
diff --git a/docs/concepts/llm-evals/code-generation-eval.md b/docs/concepts/llm-evals/code-generation-eval.md
index 5681ee7745..db90313294 100644
--- a/docs/concepts/llm-evals/code-generation-eval.md
+++ b/docs/concepts/llm-evals/code-generation-eval.md
@@ -13,7 +13,7 @@ from phoenix.experimental.evals import (
     CODE_READABILITY_PROMPT_TEMPLATE_STR,
     OpenAIModel,
     download_benchmark_dataset,
-    llm_classify,
+    llm_eval_binary,
 )
 
 model = OpenAIModel(
@@ -25,7 +25,7 @@ model = OpenAIModel(
 #It will remove text such as ",,," or "..."
 #Will ensure the binary value expected from the template is returned
 rails = list(CODE_READABILITY_PROMPT_RAILS_MAP.values())
-readability_classifications = llm_classify(
+readability_classifications = llm_eval_binary(
     dataframe=df,
     template=CODE_READABILITY_PROMPT_TEMPLATE_STR,
     model=model,
diff --git a/docs/concepts/llm-evals/hallucinations.md b/docs/concepts/llm-evals/hallucinations.md
index 1c7d76ba05..b6e162d5e2 100644
--- a/docs/concepts/llm-evals/hallucinations.md
+++ b/docs/concepts/llm-evals/hallucinations.md
@@ -10,7 +10,7 @@ from phoenix.experimental.evals import (
     HALLUCINATION_PROMPT_TEMPLATE_STR,
     OpenAIModel,
     download_benchmark_dataset,
-    llm_classify,
+    llm_eval_binary,
 )
 
 model = OpenAIModel(
@@ -22,7 +22,7 @@ model = OpenAIModel(
 #It will remove text such as ",,," or "..."
 #Will ensure the binary value expected from the template is returned
 rails = list(HALLUCINATION_PROMPT_RAILS_MAP.values())
-hallucination_classifications = llm_classify(
+hallucination_classifications = llm_eval_binary(
     dataframe=df, template=HALLUCINATION_PROMPT_TEMPLATE_STR, model=model, rails=rails
 )
diff --git a/docs/concepts/llm-evals/q-and-a-on-retrieved-data.md b/docs/concepts/llm-evals/q-and-a-on-retrieved-data.md
index 8a864d9112..1f9d9f9cc6 100644
--- a/docs/concepts/llm-evals/q-and-a-on-retrieved-data.md
+++ b/docs/concepts/llm-evals/q-and-a-on-retrieved-data.md
@@ -14,7 +14,7 @@ import phoenix.experimental.evals.templates.default_templates as templates
 from phoenix.experimental.evals import (
     OpenAIModel,
     download_benchmark_dataset,
-    llm_classify,
+    llm_eval_binary,
 )
 
 model = OpenAIModel(
@@ -26,7 +26,7 @@ model = OpenAIModel(
 #It will remove text such as ",,," or "...", anything not the
 #binary value expected from the template
 rails = list(templates.QA_PROMPT_RAILS_MAP.values())
-Q_and_A_classifications = llm_classify(
+Q_and_A_classifications = llm_eval_binary(
     dataframe=df_sample,
     template=templates.QA_PROMPT_TEMPLATE_STR,
     model=model,
diff --git a/docs/concepts/llm-evals/retrieval-rag-relevance.md b/docs/concepts/llm-evals/retrieval-rag-relevance.md
index 5591198697..3b2c2b060c 100644
--- a/docs/concepts/llm-evals/retrieval-rag-relevance.md
+++ b/docs/concepts/llm-evals/retrieval-rag-relevance.md
@@ -10,7 +10,7 @@ from phoenix.experimental.evals import (
     RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
     OpenAIModel,
     download_benchmark_dataset,
-    llm_classify,
+    llm_eval_binary,
 )
 
 model = OpenAIModel(
@@ -22,7 +22,7 @@ model = OpenAIModel(
 #It will remove text such as ",,," or "..."
 #Will ensure the binary value expected from the template is returned
 rails = list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values())
-relevance_classifications = llm_classify(
+relevance_classifications = llm_eval_binary(
     dataframe=df,
     template=RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
     model=model,
diff --git a/docs/concepts/llm-evals/summarization-eval.md b/docs/concepts/llm-evals/summarization-eval.md
index eec1f497dd..9fa3d62ca9 100644
--- a/docs/concepts/llm-evals/summarization-eval.md
+++ b/docs/concepts/llm-evals/summarization-eval.md
@@ -14,7 +14,7 @@ import phoenix.experimental.evals.templates.default_templates as templates
 from phoenix.experimental.evals import (
     OpenAIModel,
     download_benchmark_dataset,
-    llm_classify,
+    llm_eval_binary,
 )
 
 model = OpenAIModel(
@@ -26,7 +26,7 @@ model = OpenAIModel(
 #It will remove text such as ",,," or "..."
 #Will ensure the binary value expected from the template is returned
 rails = list(templates.SUMMARIZATION_PROMPT_RAILS_MAP.values())
-summarization_classifications = llm_classify(
+summarization_classifications = llm_eval_binary(
     dataframe=df_sample,
     template=templates.SUMMARIZATION_PROMPT_TEMPLATE_STR,
     model=model,
diff --git a/docs/concepts/llm-evals/toxicity.md b/docs/concepts/llm-evals/toxicity.md
index f71c84ac35..177e096b55 100644
--- a/docs/concepts/llm-evals/toxicity.md
+++ b/docs/concepts/llm-evals/toxicity.md
@@ -14,7 +14,7 @@ from phoenix.experimental.evals import (
     TOXICITY_PROMPT_TEMPLATE_STR,
     OpenAIModel,
     download_benchmark_dataset,
-    llm_classify,
+    llm_eval_binary,
 )
 
 model = OpenAIModel(
@@ -26,7 +26,7 @@ model = OpenAIModel(
 #It will remove text such as ",,," or "..."
 #Will ensure the binary value expected from the template is returned
 rails = list(TOXICITY_PROMPT_RAILS_MAP.values())
-toxic_classifications = llm_classify(
+toxic_classifications = llm_eval_binary(
     dataframe=df_sample,
     template=TOXICITY_PROMPT_TEMPLATE_STR,
     model=model,
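
Taken together, the renamed call sites above follow one pattern: build an eval model, pull the output rails for a template, run `llm_eval_binary` over a dataframe, and score the resulting labels against a golden column with scikit-learn. The sketch below strings those pieces together end to end. It is a minimal illustration assembled from the snippets in this diff, not a confirmed part of the library's docs; the toy dataframe contents and its `query`/`reference`/`relevant` column names are assumptions for demonstration only.

```python
# Minimal end-to-end sketch assembled from the snippets in this diff.
# The toy dataframe below is illustrative only; its column names ("query",
# "reference", "relevant") are assumed to match what the template expects.
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

from phoenix.experimental.evals import (
    RAG_RELEVANCY_PROMPT_RAILS_MAP,
    RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
    OpenAIModel,
    llm_eval_binary,
)

# Eval model, configured as in the docs above
model = OpenAIModel(model_name="gpt-4", temperature=0.0)

# Rails constrain the eval output to the binary labels the template expects
rails = list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values())

# Hypothetical golden dataset: "relevant" holds the human label
df = pd.DataFrame(
    {
        "query": ["How do I install Phoenix?", "How do I install Phoenix?"],
        "reference": ["Run pip install arize-phoenix.", "Phoenix ships eval templates."],
        "relevant": [True, False],
    }
)

# Run the eval and keep the binary labels alongside the golden column
df["eval_relevance"] = llm_eval_binary(
    dataframe=df,
    template=RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
    model=model,
    rails=rails,
)

# Map the golden booleans into the same label space and score with scikit-learn
y_true = df["relevant"].map({True: "relevant", False: "irrelevant"})
y_pred = df["eval_relevance"]
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, pos_label="relevant", average="binary"
)
print(f"precision={precision:.2f} recall={recall:.2f} f1={f1:.2f}")
```

The per-template pages touched by this diff (hallucination, code readability, Q&A, summarization, toxicity) differ only in which template string, rails map, and dataframe columns they plug into this same pattern.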