docs: agent metrics code examples improvement (#1983)

yesidc · web-flow · commit c8efc3240d11 · 2025-09-17T14:48:52.000+05:30
Description
This update refactors the agent metrics examples by wrapping each one in
an async function, so that the code can be executed as-is without any
further modifications.

Key Changes:
Asynchronous Functionality: Wrapped the relevant code in an async def
function that is run with asyncio.run(), allowing for proper asynchronous
execution. This replaces the top-level await calls, which fail when an
example is run as a plain Python script.

Evaluator LLM Initialization: The examples now show how the
evaluator_llm object is created, by wrapping a ChatOpenAI model set to
gpt-4o in LangchainLLMWrapper. This improves clarity, as the examples
previously used evaluator_llm without showing how it was set up.

Code Readability: Improved formatting and organization of import
statements for better readability and maintainability.
diff --git a/docs/concepts/metrics/available_metrics/agents.md b/docs/concepts/metrics/available_metrics/agents.md
@@ -24,36 +24,84 @@ $$
 ### Example
 
 ```python
+import asyncio
+
 from ragas.dataset_schema import  SingleTurnSample, MultiTurnSample, EvaluationDataset
 from ragas.messages import HumanMessage,AIMessage,ToolMessage,ToolCall
 from ragas.metrics import TopicAdherenceScore
+from ragas.llms import LangchainLLMWrapper
+from langchain_openai import ChatOpenAI
+
+
+evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
+
+
+async def evaluate_topic_adherence():
+
+    sample_input_4 = [
+        HumanMessage(
+            content="Can you provide me with details about Einstein's theory of relativity?"
+        ),
+        AIMessage(
+            content="Sure, let me retrieve the relevant information for you.",
+            tool_calls=[
+                ToolCall(
+                    name="document_search",
+                    args={"query": "Einstein's theory of relativity"},
+                )
+            ],
+        ),
+        ToolMessage(
+            content="Found relevant documents: 1. Relativity: The Special and the General Theory, 2. General Theory of Relativity by A. Einstein."
+        ),
+        AIMessage(
+            content="I found some documents on Einstein's theory of relativity. Which one would you like to know more about: 'Relativity: The Special and the General Theory' or 'General Theory of Relativity by A. Einstein'?"
+        ),
+        HumanMessage(content="Tell me about the 'General Theory of Relativity'."),
+        AIMessage(
+            content="Got it! Let me fetch more details from 'General Theory of Relativity by A. Einstein'.",
+            tool_calls=[
+                ToolCall(
+                    name="document_retrieve",
+                    args={"document": "General Theory of Relativity by A. Einstein"},
+                )
+            ],
+        ),
+        ToolMessage(
+            content="The document discusses how gravity affects the fabric of spacetime, describing the relationship between mass and spacetime curvature."
+        ),
+        AIMessage(
+            content="The 'General Theory of Relativity' explains how gravity affects the fabric of spacetime and the relationship between mass and spacetime curvature. Would you like more details or a specific explanation?"
+        ),
+        HumanMessage(
+            content="No, that's perfect. By the way, do you know any good recipes for a chocolate cake?"
+        ),
+        AIMessage(
+            content="Sure! Let me find a simple and delicious recipe for a chocolate cake.",
+            tool_calls=[
+                ToolCall(name="recipe_search", args={"query": "chocolate cake recipe"})
+            ],
+        ),
+        ToolMessage(
+            content="Here’s a popular recipe for a chocolate cake: Ingredients include flour, sugar, cocoa powder, eggs, milk, and butter. Instructions: Mix dry ingredients, add wet ingredients, and bake at 350°F for 30-35 minutes."
+        ),
+        AIMessage(
+            content="I found a great recipe for chocolate cake! Would you like the full details, or is that summary enough?"
+        ),
+    ]
+
+    sample = MultiTurnSample(user_input=sample_input_4, reference_topics=["science"])
+    scorer = TopicAdherenceScore(llm=evaluator_llm, mode="precision")
+    score = await scorer.multi_turn_ascore(sample)
+    print(score)
+
+
+if __name__ == "__main__":
+
+    asyncio.run(evaluate_topic_adherence())
+
 
 
-sample_input_4 = [
-HumanMessage(content="Can you provide me with details about Einstein's theory of relativity?"),
-AIMessage(content="Sure, let me retrieve the relevant information for you.", tool_calls=[
-    ToolCall(name="document_search", args={"query": "Einstein's theory of relativity"})
-]),
-ToolMessage(content="Found relevant documents: 1. Relativity: The Special and the General Theory, 2. General Theory of Relativity by A. Einstein."),
-AIMessage(content="I found some documents on Einstein's theory of relativity. Which one would you like to know more about: 'Relativity: The Special and the General Theory' or 'General Theory of Relativity by A. Einstein'?"),
-HumanMessage(content="Tell me about the 'General Theory of Relativity'."),
-AIMessage(content="Got it! Let me fetch more details from 'General Theory of Relativity by A. Einstein'.", tool_calls=[
-    ToolCall(name="document_retrieve", args={"document": "General Theory of Relativity by A. Einstein"})
-]),
-ToolMessage(content="The document discusses how gravity affects the fabric of spacetime, describing the relationship between mass and spacetime curvature."),
-AIMessage(content="The 'General Theory of Relativity' explains how gravity affects the fabric of spacetime and the relationship between mass and spacetime curvature. Would you like more details or a specific explanation?"),
-HumanMessage(content="No, that's perfect. By the way, do you know any good recipes for a chocolate cake?"),
-AIMessage(content="Sure! Let me find a simple and delicious recipe for a chocolate cake.", tool_calls=[
-    ToolCall(name="recipe_search", args={"query": "chocolate cake recipe"})
-]),
-ToolMessage(content="Here’s a popular recipe for a chocolate cake: Ingredients include flour, sugar, cocoa powder, eggs, milk, and butter. Instructions: Mix dry ingredients, add wet ingredients, and bake at 350°F for 30-35 minutes."),
-AIMessage(content="I found a great recipe for chocolate cake! Would you like the full details, or is that summary enough?")
-]
-
-
-sample = MultiTurnSample(user_input=sample_input_4, reference_topics=["science"])
-scorer = TopicAdherenceScore(llm = evaluator_llm, mode="precision")
-await scorer.multi_turn_ascore(sample)
 ```
 Output
 ```
@@ -78,33 +126,52 @@ Output
 `ToolCallAccuracy` is a metric that can be used to evaluate the performance of the LLM in identifying and calling the required tools to complete a given task. This metric needs `user_input` and `reference_tool_calls` to evaluate the performance of the LLM in identifying and calling the required tools to complete a given task. The metric is computed by comparing the `reference_tool_calls` with the Tool calls made by the AI. The values range between 0 and 1, with higher values indicating better performance. 
 
 ```python
+
+import asyncio
+
+from ragas.dataset_schema import MultiTurnSample
+from ragas.messages import AIMessage, HumanMessage, ToolCall, ToolMessage
 from ragas.metrics import ToolCallAccuracy
-from ragas.dataset_schema import  MultiTurnSample
-from ragas.messages import HumanMessage,AIMessage,ToolMessage,ToolCall
 
-sample = [
-    HumanMessage(content="What's the weather like in New York right now?"),
-    AIMessage(content="The current temperature in New York is 75°F and it's partly cloudy.", tool_calls=[
-        ToolCall(name="weather_check", args={"location": "New York"})
-    ]),
-    HumanMessage(content="Can you translate that to Celsius?"),
-    AIMessage(content="Let me convert that to Celsius for you.", tool_calls=[
-        ToolCall(name="temperature_conversion", args={"temperature_fahrenheit": 75})
-    ]),
-    ToolMessage(content="75°F is approximately 23.9°C."),
-    AIMessage(content="75°F is approximately 23.9°C.")
-]
-
-sample = MultiTurnSample(
-    user_input=sample,
-    reference_tool_calls=[
-        ToolCall(name="weather_check", args={"location": "New York"}),
-        ToolCall(name="temperature_conversion", args={"temperature_fahrenheit": 75})
+
+async def evaluate_tool_call_accuracy():
+    sample = [
+        HumanMessage(content="What's the weather like in New York right now?"),
+        AIMessage(
+            content="The current temperature in New York is 75°F and it's partly cloudy.",
+            tool_calls=[ToolCall(name="weather_check", args={"location": "New York"})],
+        ),
+        HumanMessage(content="Can you translate that to Celsius?"),
+        AIMessage(
+            content="Let me convert that to Celsius for you.",
+            tool_calls=[
+                ToolCall(
+                    name="temperature_conversion", args={"temperature_fahrenheit": 75}
+                )
+            ],
+        ),
+        ToolMessage(content="75°F is approximately 23.9°C."),
+        AIMessage(content="75°F is approximately 23.9°C."),
     ]
-)
 
-scorer = ToolCallAccuracy()
-await scorer.multi_turn_ascore(sample)
+    sample = MultiTurnSample(
+        user_input=sample,
+        reference_tool_calls=[
+            ToolCall(name="weather_check", args={"location": "New York"}),
+            ToolCall(
+                name="temperature_conversion", args={"temperature_fahrenheit": 75}
+            ),
+        ],
+    )
+
+    scorer = ToolCallAccuracy()
+    score = await scorer.multi_turn_ascore(sample)
+    print(score)
+
+
+if __name__ == "__main__":
+    asyncio.run(evaluate_tool_call_accuracy())
+
 ```
 Output
 ```
@@ -134,31 +201,64 @@ Calculating `AgentGoalAccuracyWithReference` with reference needs `user_input` a
 
 
 ```python
-from ragas.dataset_schema import  MultiTurnSample
-from ragas.messages import HumanMessage,AIMessage,ToolMessage,ToolCall
-from ragas.metrics import AgentGoalAccuracyWithReference
-
+import asyncio
 
-sample = MultiTurnSample(user_input=[
-    HumanMessage(content="Hey, book a table at the nearest best Chinese restaurant for 8:00pm"),
-    AIMessage(content="Sure, let me find the best options for you.", tool_calls=[
-        ToolCall(name="restaurant_search", args={"cuisine": "Chinese", "time": "8:00pm"})
-    ]),
-    ToolMessage(content="Found a few options: 1. Golden Dragon, 2. Jade Palace"),
-    AIMessage(content="I found some great options: Golden Dragon and Jade Palace. Which one would you prefer?"),
-    HumanMessage(content="Let's go with Golden Dragon."),
-    AIMessage(content="Great choice! I'll book a table for 8:00pm at Golden Dragon.", tool_calls=[
-        ToolCall(name="restaurant_book", args={"name": "Golden Dragon", "time": "8:00pm"})
-    ]),
-    ToolMessage(content="Table booked at Golden Dragon for 8:00pm."),
-    AIMessage(content="Your table at Golden Dragon is booked for 8:00pm. Enjoy your meal!"),
-    HumanMessage(content="thanks"),
-],
-    reference="Table booked at one of the chinese restaurants at 8 pm")
-
-scorer = AgentGoalAccuracyWithReference(llm = evaluator_llm)
-await scorer.multi_turn_ascore(sample)
+from langchain_openai import ChatOpenAI
+from ragas.dataset_schema import MultiTurnSample
+from ragas.llms import LangchainLLMWrapper
+from ragas.messages import AIMessage, HumanMessage, ToolCall, ToolMessage
+from ragas.metrics import AgentGoalAccuracyWithReference
 
+evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
+
+
+async def evaluate_agent_goal_accuracy_with_reference():
+    sample = MultiTurnSample(
+        user_input=[
+            HumanMessage(
+                content="Hey, book a table at the nearest best Chinese restaurant for 8:00pm"
+            ),
+            AIMessage(
+                content="Sure, let me find the best options for you.",
+                tool_calls=[
+                    ToolCall(
+                        name="restaurant_search",
+                        args={"cuisine": "Chinese", "time": "8:00pm"},
+                    )
+                ],
+            ),
+            ToolMessage(
+                content="Found a few options: 1. Golden Dragon, 2. Jade Palace"
+            ),
+            AIMessage(
+                content="I found some great options: Golden Dragon and Jade Palace. Which one would you prefer?"
+            ),
+            HumanMessage(content="Let's go with Golden Dragon."),
+            AIMessage(
+                content="Great choice! I'll book a table for 8:00pm at Golden Dragon.",
+                tool_calls=[
+                    ToolCall(
+                        name="restaurant_book",
+                        args={"name": "Golden Dragon", "time": "8:00pm"},
+                    )
+                ],
+            ),
+            ToolMessage(content="Table booked at Golden Dragon for 8:00pm."),
+            AIMessage(
+                content="Your table at Golden Dragon is booked for 8:00pm. Enjoy your meal!"
+            ),
+            HumanMessage(content="thanks"),
+        ],
+        reference="Table booked at one of the chinese restaurants at 8 pm",
+    )
+
+    scorer = AgentGoalAccuracyWithReference(llm=evaluator_llm)
+    score = await scorer.multi_turn_ascore(sample)
+    print(score)
+
+
+if __name__ == "__main__":
+    asyncio.run(evaluate_agent_goal_accuracy_with_reference())
 ```
 Output
 ```
@@ -173,29 +273,64 @@ Output
 ### Example
 
 ```python
-from ragas.dataset_schema import  MultiTurnSample
-from ragas.messages import HumanMessage,AIMessage,ToolMessage,ToolCall
-from ragas.metrics import AgentGoalAccuracyWithoutReference
 
+import asyncio
+
+from langchain_openai import ChatOpenAI
+from ragas.dataset_schema import MultiTurnSample
+from ragas.llms import LangchainLLMWrapper
+from ragas.messages import AIMessage, HumanMessage, ToolCall, ToolMessage
+from ragas.metrics import AgentGoalAccuracyWithoutReference
 
-sample = MultiTurnSample(user_input=[
-    HumanMessage(content="Hey, book a table at the nearest best Chinese restaurant for 8:00pm"),
-    AIMessage(content="Sure, let me find the best options for you.", tool_calls=[
-        ToolCall(name="restaurant_search", args={"cuisine": "Chinese", "time": "8:00pm"})
-    ]),
-    ToolMessage(content="Found a few options: 1. Golden Dragon, 2. Jade Palace"),
-    AIMessage(content="I found some great options: Golden Dragon and Jade Palace. Which one would you prefer?"),
-    HumanMessage(content="Let's go with Golden Dragon."),
-    AIMessage(content="Great choice! I'll book a table for 8:00pm at Golden Dragon.", tool_calls=[
-        ToolCall(name="restaurant_book", args={"name": "Golden Dragon", "time": "8:00pm"})
-    ]),
-    ToolMessage(content="Table booked at Golden Dragon for 8:00pm."),
-    AIMessage(content="Your table at Golden Dragon is booked for 8:00pm. Enjoy your meal!"),
-    HumanMessage(content="thanks"),
-])
-
-scorer = AgentGoalAccuracyWithoutReference(llm = evaluator_llm)
-await scorer.multi_turn_ascore(sample)
+evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
+
+
+async def evaluate_agent_goal_accuracy_without_reference():
+
+    sample = MultiTurnSample(
+        user_input=[
+            HumanMessage(
+                content="Hey, book a table at the nearest best Chinese restaurant for 8:00pm"
+            ),
+            AIMessage(
+                content="Sure, let me find the best options for you.",
+                tool_calls=[
+                    ToolCall(
+                        name="restaurant_search",
+                        args={"cuisine": "Chinese", "time": "8:00pm"},
+                    )
+                ],
+            ),
+            ToolMessage(
+                content="Found a few options: 1. Golden Dragon, 2. Jade Palace"
+            ),
+            AIMessage(
+                content="I found some great options: Golden Dragon and Jade Palace. Which one would you prefer?"
+            ),
+            HumanMessage(content="Let's go with Golden Dragon."),
+            AIMessage(
+                content="Great choice! I'll book a table for 8:00pm at Golden Dragon.",
+                tool_calls=[
+                    ToolCall(
+                        name="restaurant_book",
+                        args={"name": "Golden Dragon", "time": "8:00pm"},
+                    )
+                ],
+            ),
+            ToolMessage(content="Table booked at Golden Dragon for 8:00pm."),
+            AIMessage(
+                content="Your table at Golden Dragon is booked for 8:00pm. Enjoy your meal!"
+            ),
+            HumanMessage(content="thanks"),
+        ]
+    )
+
+    scorer = AgentGoalAccuracyWithoutReference(llm=evaluator_llm)
+    score = await scorer.multi_turn_ascore(sample)
+    print(score)
+
+if __name__ == "__main__":
+    asyncio.run(evaluate_agent_goal_accuracy_without_reference())
 
 ```
 Output