Skip to content

Commit c8efc32

Browse files
authored
docs: agent metrics code examples improvement (#1983)
Description: This update refactors the provided examples by wrapping them in the corresponding async blocks, so that the code can be executed as-is without any further modifications. Key Changes: Asynchronous Functionality: Wrapped the relevant code in an `async def` function, allowing for proper asynchronous execution. This eliminates runtime issues associated with unhandled asynchronous operations. Evaluator LLM Initialization: Clearly demonstrates the creation of the `evaluator_llm` object using `LangchainLLMWrapper` with the `ChatOpenAI` model set to `gpt-4o`. This improves clarity, as the setup of the evaluator was previously not self-explanatory. Code Readability: Improved the formatting and organization of import statements for better readability and maintainability.
1 parent 2dae552 commit c8efc32

File tree

1 file changed

+227
-92
lines changed
  • docs/concepts/metrics/available_metrics

1 file changed

+227
-92
lines changed

docs/concepts/metrics/available_metrics/agents.md

Lines changed: 227 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -24,36 +24,84 @@ $$
2424
### Example
2525

2626
```python
27+
import asyncio
28+
2729
from ragas.dataset_schema import SingleTurnSample, MultiTurnSample, EvaluationDataset
2830
from ragas.messages import HumanMessage,AIMessage,ToolMessage,ToolCall
2931
from ragas.metrics import TopicAdherenceScore
32+
from ragas.llms import LangchainLLMWrapper
33+
from langchain_openai import ChatOpenAI
34+
35+
36+
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
37+
38+
39+
async def evaluate_topic_adherence():
40+
41+
sample_input_4 = [
42+
HumanMessage(
43+
content="Can you provide me with details about Einstein's theory of relativity?"
44+
),
45+
AIMessage(
46+
content="Sure, let me retrieve the relevant information for you.",
47+
tool_calls=[
48+
ToolCall(
49+
name="document_search",
50+
args={"query": "Einstein's theory of relativity"},
51+
)
52+
],
53+
),
54+
ToolMessage(
55+
content="Found relevant documents: 1. Relativity: The Special and the General Theory, 2. General Theory of Relativity by A. Einstein."
56+
),
57+
AIMessage(
58+
content="I found some documents on Einstein's theory of relativity. Which one would you like to know more about: 'Relativity: The Special and the General Theory' or 'General Theory of Relativity by A. Einstein'?"
59+
),
60+
HumanMessage(content="Tell me about the 'General Theory of Relativity'."),
61+
AIMessage(
62+
content="Got it! Let me fetch more details from 'General Theory of Relativity by A. Einstein'.",
63+
tool_calls=[
64+
ToolCall(
65+
name="document_retrieve",
66+
args={"document": "General Theory of Relativity by A. Einstein"},
67+
)
68+
],
69+
),
70+
ToolMessage(
71+
content="The document discusses how gravity affects the fabric of spacetime, describing the relationship between mass and spacetime curvature."
72+
),
73+
AIMessage(
74+
content="The 'General Theory of Relativity' explains how gravity affects the fabric of spacetime and the relationship between mass and spacetime curvature. Would you like more details or a specific explanation?"
75+
),
76+
HumanMessage(
77+
content="No, that's perfect. By the way, do you know any good recipes for a chocolate cake?"
78+
),
79+
AIMessage(
80+
content="Sure! Let me find a simple and delicious recipe for a chocolate cake.",
81+
tool_calls=[
82+
ToolCall(name="recipe_search", args={"query": "chocolate cake recipe"})
83+
],
84+
),
85+
ToolMessage(
86+
content="Here’s a popular recipe for a chocolate cake: Ingredients include flour, sugar, cocoa powder, eggs, milk, and butter. Instructions: Mix dry ingredients, add wet ingredients, and bake at 350°F for 30-35 minutes."
87+
),
88+
AIMessage(
89+
content="I found a great recipe for chocolate cake! Would you like the full details, or is that summary enough?"
90+
),
91+
]
92+
93+
sample = MultiTurnSample(user_input=sample_input_4, reference_topics=["science"])
94+
scorer = TopicAdherenceScore(llm=evaluator_llm, mode="precision")
95+
score = await scorer.multi_turn_ascore(sample)
96+
print(score)
97+
98+
99+
if __name__ == "__main__":
100+
101+
asyncio.run(evaluate_topic_adherence())
102+
30103

31104

32-
sample_input_4 = [
33-
HumanMessage(content="Can you provide me with details about Einstein's theory of relativity?"),
34-
AIMessage(content="Sure, let me retrieve the relevant information for you.", tool_calls=[
35-
ToolCall(name="document_search", args={"query": "Einstein's theory of relativity"})
36-
]),
37-
ToolMessage(content="Found relevant documents: 1. Relativity: The Special and the General Theory, 2. General Theory of Relativity by A. Einstein."),
38-
AIMessage(content="I found some documents on Einstein's theory of relativity. Which one would you like to know more about: 'Relativity: The Special and the General Theory' or 'General Theory of Relativity by A. Einstein'?"),
39-
HumanMessage(content="Tell me about the 'General Theory of Relativity'."),
40-
AIMessage(content="Got it! Let me fetch more details from 'General Theory of Relativity by A. Einstein'.", tool_calls=[
41-
ToolCall(name="document_retrieve", args={"document": "General Theory of Relativity by A. Einstein"})
42-
]),
43-
ToolMessage(content="The document discusses how gravity affects the fabric of spacetime, describing the relationship between mass and spacetime curvature."),
44-
AIMessage(content="The 'General Theory of Relativity' explains how gravity affects the fabric of spacetime and the relationship between mass and spacetime curvature. Would you like more details or a specific explanation?"),
45-
HumanMessage(content="No, that's perfect. By the way, do you know any good recipes for a chocolate cake?"),
46-
AIMessage(content="Sure! Let me find a simple and delicious recipe for a chocolate cake.", tool_calls=[
47-
ToolCall(name="recipe_search", args={"query": "chocolate cake recipe"})
48-
]),
49-
ToolMessage(content="Here’s a popular recipe for a chocolate cake: Ingredients include flour, sugar, cocoa powder, eggs, milk, and butter. Instructions: Mix dry ingredients, add wet ingredients, and bake at 350°F for 30-35 minutes."),
50-
AIMessage(content="I found a great recipe for chocolate cake! Would you like the full details, or is that summary enough?")
51-
]
52-
53-
54-
sample = MultiTurnSample(user_input=sample_input_4, reference_topics=["science"])
55-
scorer = TopicAdherenceScore(llm = evaluator_llm, mode="precision")
56-
await scorer.multi_turn_ascore(sample)
57105
```
58106
Output
59107
```
@@ -78,33 +126,52 @@ Output
78126
`ToolCallAccuracy` is a metric that can be used to evaluate the performance of the LLM in identifying and calling the required tools to complete a given task. This metric needs `user_input` and `reference_tool_calls`. It is computed by comparing the `reference_tool_calls` with the tool calls made by the AI. The values range between 0 and 1, with higher values indicating better performance.
79127

80128
```python
129+
130+
import asyncio
131+
132+
from ragas.dataset_schema import MultiTurnSample
133+
from ragas.messages import AIMessage, HumanMessage, ToolCall, ToolMessage
81134
from ragas.metrics import ToolCallAccuracy
82-
from ragas.dataset_schema import MultiTurnSample
83-
from ragas.messages import HumanMessage,AIMessage,ToolMessage,ToolCall
84135

85-
sample = [
86-
HumanMessage(content="What's the weather like in New York right now?"),
87-
AIMessage(content="The current temperature in New York is 75°F and it's partly cloudy.", tool_calls=[
88-
ToolCall(name="weather_check", args={"location": "New York"})
89-
]),
90-
HumanMessage(content="Can you translate that to Celsius?"),
91-
AIMessage(content="Let me convert that to Celsius for you.", tool_calls=[
92-
ToolCall(name="temperature_conversion", args={"temperature_fahrenheit": 75})
93-
]),
94-
ToolMessage(content="75°F is approximately 23.9°C."),
95-
AIMessage(content="75°F is approximately 23.9°C.")
96-
]
97-
98-
sample = MultiTurnSample(
99-
user_input=sample,
100-
reference_tool_calls=[
101-
ToolCall(name="weather_check", args={"location": "New York"}),
102-
ToolCall(name="temperature_conversion", args={"temperature_fahrenheit": 75})
136+
137+
async def evaluate_tool_call_accuracy():
138+
sample = [
139+
HumanMessage(content="What's the weather like in New York right now?"),
140+
AIMessage(
141+
content="The current temperature in New York is 75°F and it's partly cloudy.",
142+
tool_calls=[ToolCall(name="weather_check", args={"location": "New York"})],
143+
),
144+
HumanMessage(content="Can you translate that to Celsius?"),
145+
AIMessage(
146+
content="Let me convert that to Celsius for you.",
147+
tool_calls=[
148+
ToolCall(
149+
name="temperature_conversion", args={"temperature_fahrenheit": 75}
150+
)
151+
],
152+
),
153+
ToolMessage(content="75°F is approximately 23.9°C."),
154+
AIMessage(content="75°F is approximately 23.9°C."),
103155
]
104-
)
105156

106-
scorer = ToolCallAccuracy()
107-
await scorer.multi_turn_ascore(sample)
157+
sample = MultiTurnSample(
158+
user_input=sample,
159+
reference_tool_calls=[
160+
ToolCall(name="weather_check", args={"location": "New York"}),
161+
ToolCall(
162+
name="temperature_conversion", args={"temperature_fahrenheit": 75}
163+
),
164+
],
165+
)
166+
167+
scorer = ToolCallAccuracy()
168+
score = await scorer.multi_turn_ascore(sample)
169+
print(score)
170+
171+
172+
if __name__ == "__main__":
173+
asyncio.run(evaluate_tool_call_accuracy())
174+
108175
```
109176
Output
110177
```
@@ -134,31 +201,64 @@ Calculating `AgentGoalAccuracyWithReference` with reference needs `user_input` a
134201

135202

136203
```python
137-
from ragas.dataset_schema import MultiTurnSample
138-
from ragas.messages import HumanMessage,AIMessage,ToolMessage,ToolCall
139-
from ragas.metrics import AgentGoalAccuracyWithReference
140-
204+
import asyncio
141205

142-
sample = MultiTurnSample(user_input=[
143-
HumanMessage(content="Hey, book a table at the nearest best Chinese restaurant for 8:00pm"),
144-
AIMessage(content="Sure, let me find the best options for you.", tool_calls=[
145-
ToolCall(name="restaurant_search", args={"cuisine": "Chinese", "time": "8:00pm"})
146-
]),
147-
ToolMessage(content="Found a few options: 1. Golden Dragon, 2. Jade Palace"),
148-
AIMessage(content="I found some great options: Golden Dragon and Jade Palace. Which one would you prefer?"),
149-
HumanMessage(content="Let's go with Golden Dragon."),
150-
AIMessage(content="Great choice! I'll book a table for 8:00pm at Golden Dragon.", tool_calls=[
151-
ToolCall(name="restaurant_book", args={"name": "Golden Dragon", "time": "8:00pm"})
152-
]),
153-
ToolMessage(content="Table booked at Golden Dragon for 8:00pm."),
154-
AIMessage(content="Your table at Golden Dragon is booked for 8:00pm. Enjoy your meal!"),
155-
HumanMessage(content="thanks"),
156-
],
157-
reference="Table booked at one of the chinese restaurants at 8 pm")
158-
159-
scorer = AgentGoalAccuracyWithReference(llm = evaluator_llm)
160-
await scorer.multi_turn_ascore(sample)
206+
from langchain_openai import ChatOpenAI
207+
from ragas.dataset_schema import MultiTurnSample
208+
from ragas.llms import LangchainLLMWrapper
209+
from ragas.messages import AIMessage, HumanMessage, ToolCall, ToolMessage
210+
from ragas.metrics import AgentGoalAccuracyWithReference
161211

212+
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
213+
214+
215+
async def evaluate_agent_goal_accuracy_with_reference():
216+
sample = MultiTurnSample(
217+
user_input=[
218+
HumanMessage(
219+
content="Hey, book a table at the nearest best Chinese restaurant for 8:00pm"
220+
),
221+
AIMessage(
222+
content="Sure, let me find the best options for you.",
223+
tool_calls=[
224+
ToolCall(
225+
name="restaurant_search",
226+
args={"cuisine": "Chinese", "time": "8:00pm"},
227+
)
228+
],
229+
),
230+
ToolMessage(
231+
content="Found a few options: 1. Golden Dragon, 2. Jade Palace"
232+
),
233+
AIMessage(
234+
content="I found some great options: Golden Dragon and Jade Palace. Which one would you prefer?"
235+
),
236+
HumanMessage(content="Let's go with Golden Dragon."),
237+
AIMessage(
238+
content="Great choice! I'll book a table for 8:00pm at Golden Dragon.",
239+
tool_calls=[
240+
ToolCall(
241+
name="restaurant_book",
242+
args={"name": "Golden Dragon", "time": "8:00pm"},
243+
)
244+
],
245+
),
246+
ToolMessage(content="Table booked at Golden Dragon for 8:00pm."),
247+
AIMessage(
248+
content="Your table at Golden Dragon is booked for 8:00pm. Enjoy your meal!"
249+
),
250+
HumanMessage(content="thanks"),
251+
],
252+
reference="Table booked at one of the chinese restaurants at 8 pm",
253+
)
254+
255+
scorer = AgentGoalAccuracyWithReference(llm=evaluator_llm)
256+
score = await scorer.multi_turn_ascore(sample)
257+
print(score)
258+
259+
260+
if __name__ == "__main__":
261+
asyncio.run(evaluate_agent_goal_accuracy_with_reference())
162262
```
163263
Output
164264
```
@@ -173,29 +273,64 @@ Output
173273
### Example
174274

175275
```python
176-
from ragas.dataset_schema import MultiTurnSample
177-
from ragas.messages import HumanMessage,AIMessage,ToolMessage,ToolCall
178-
from ragas.metrics import AgentGoalAccuracyWithoutReference
179276

277+
import asyncio
278+
279+
from langchain_openai import ChatOpenAI
280+
from ragas.dataset_schema import MultiTurnSample
281+
from ragas.llms import LangchainLLMWrapper
282+
from ragas.messages import AIMessage, HumanMessage, ToolCall, ToolMessage
283+
from ragas.metrics import AgentGoalAccuracyWithoutReference
180284

181-
sample = MultiTurnSample(user_input=[
182-
HumanMessage(content="Hey, book a table at the nearest best Chinese restaurant for 8:00pm"),
183-
AIMessage(content="Sure, let me find the best options for you.", tool_calls=[
184-
ToolCall(name="restaurant_search", args={"cuisine": "Chinese", "time": "8:00pm"})
185-
]),
186-
ToolMessage(content="Found a few options: 1. Golden Dragon, 2. Jade Palace"),
187-
AIMessage(content="I found some great options: Golden Dragon and Jade Palace. Which one would you prefer?"),
188-
HumanMessage(content="Let's go with Golden Dragon."),
189-
AIMessage(content="Great choice! I'll book a table for 8:00pm at Golden Dragon.", tool_calls=[
190-
ToolCall(name="restaurant_book", args={"name": "Golden Dragon", "time": "8:00pm"})
191-
]),
192-
ToolMessage(content="Table booked at Golden Dragon for 8:00pm."),
193-
AIMessage(content="Your table at Golden Dragon is booked for 8:00pm. Enjoy your meal!"),
194-
HumanMessage(content="thanks"),
195-
])
196-
197-
scorer = AgentGoalAccuracyWithoutReference(llm = evaluator_llm)
198-
await scorer.multi_turn_ascore(sample)
285+
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
286+
287+
288+
async def evaluate_agent_goal_accuracy_without_reference():
289+
290+
sample = MultiTurnSample(
291+
user_input=[
292+
HumanMessage(
293+
content="Hey, book a table at the nearest best Chinese restaurant for 8:00pm"
294+
),
295+
AIMessage(
296+
content="Sure, let me find the best options for you.",
297+
tool_calls=[
298+
ToolCall(
299+
name="restaurant_search",
300+
args={"cuisine": "Chinese", "time": "8:00pm"},
301+
)
302+
],
303+
),
304+
ToolMessage(
305+
content="Found a few options: 1. Golden Dragon, 2. Jade Palace"
306+
),
307+
AIMessage(
308+
content="I found some great options: Golden Dragon and Jade Palace. Which one would you prefer?"
309+
),
310+
HumanMessage(content="Let's go with Golden Dragon."),
311+
AIMessage(
312+
content="Great choice! I'll book a table for 8:00pm at Golden Dragon.",
313+
tool_calls=[
314+
ToolCall(
315+
name="restaurant_book",
316+
args={"name": "Golden Dragon", "time": "8:00pm"},
317+
)
318+
],
319+
),
320+
ToolMessage(content="Table booked at Golden Dragon for 8:00pm."),
321+
AIMessage(
322+
content="Your table at Golden Dragon is booked for 8:00pm. Enjoy your meal!"
323+
),
324+
HumanMessage(content="thanks"),
325+
]
326+
)
327+
328+
scorer = AgentGoalAccuracyWithoutReference(llm=evaluator_llm)
329+
score = await scorer.multi_turn_ascore(sample)
330+
print(score)
331+
332+
if __name__ == "__main__":
333+
asyncio.run(evaluate_agent_goal_accuracy_without_reference())
199334

200335
```
201336
Output

0 commit comments

Comments
 (0)