Skip to content

docs: agent metrics code examples improvement #1983

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
319 changes: 227 additions & 92 deletions docs/concepts/metrics/available_metrics/agents.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,36 +24,84 @@ $$
### Example

```python
import asyncio

from ragas.dataset_schema import SingleTurnSample, MultiTurnSample, EvaluationDataset
from ragas.messages import HumanMessage,AIMessage,ToolMessage,ToolCall
from ragas.metrics import TopicAdherenceScore
from ragas.llms import LangchainLLMWrapper
from langchain_openai import ChatOpenAI


evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))


async def evaluate_topic_adherence():
    """Score one multi-turn conversation with `TopicAdherenceScore` and print it.

    The conversation starts on Einstein's theory of relativity and then
    switches to a chocolate cake recipe; the only reference topic supplied
    is "science" and the metric runs in "precision" mode.
    """
    # Tool calls issued by the assistant, named up front to keep the
    # message list below readable.
    search_call = ToolCall(
        name="document_search",
        args={"query": "Einstein's theory of relativity"},
    )
    retrieve_call = ToolCall(
        name="document_retrieve",
        args={"document": "General Theory of Relativity by A. Einstein"},
    )
    recipe_call = ToolCall(name="recipe_search", args={"query": "chocolate cake recipe"})

    conversation = [
        HumanMessage(
            content="Can you provide me with details about Einstein's theory of relativity?"
        ),
        AIMessage(
            content="Sure, let me retrieve the relevant information for you.",
            tool_calls=[search_call],
        ),
        ToolMessage(
            content="Found relevant documents: 1. Relativity: The Special and the General Theory, 2. General Theory of Relativity by A. Einstein."
        ),
        AIMessage(
            content="I found some documents on Einstein's theory of relativity. Which one would you like to know more about: 'Relativity: The Special and the General Theory' or 'General Theory of Relativity by A. Einstein'?"
        ),
        HumanMessage(content="Tell me about the 'General Theory of Relativity'."),
        AIMessage(
            content="Got it! Let me fetch more details from 'General Theory of Relativity by A. Einstein'.",
            tool_calls=[retrieve_call],
        ),
        ToolMessage(
            content="The document discusses how gravity affects the fabric of spacetime, describing the relationship between mass and spacetime curvature."
        ),
        AIMessage(
            content="The 'General Theory of Relativity' explains how gravity affects the fabric of spacetime and the relationship between mass and spacetime curvature. Would you like more details or a specific explanation?"
        ),
        HumanMessage(
            content="No, that's perfect. By the way, do you know any good recipes for a chocolate cake?"
        ),
        AIMessage(
            content="Sure! Let me find a simple and delicious recipe for a chocolate cake.",
            tool_calls=[recipe_call],
        ),
        ToolMessage(
            content="Here’s a popular recipe for a chocolate cake: Ingredients include flour, sugar, cocoa powder, eggs, milk, and butter. Instructions: Mix dry ingredients, add wet ingredients, and bake at 350°F for 30-35 minutes."
        ),
        AIMessage(
            content="I found a great recipe for chocolate cake! Would you like the full details, or is that summary enough?"
        ),
    ]

    multi_turn_sample = MultiTurnSample(
        user_input=conversation,
        reference_topics=["science"],
    )
    metric = TopicAdherenceScore(llm=evaluator_llm, mode="precision")
    score = await metric.multi_turn_ascore(multi_turn_sample)
    print(score)


if __name__ == "__main__":
    asyncio.run(evaluate_topic_adherence())



# Notebook-style variant of the topic-adherence example. The bare `await` on
# the last line only works where top-level await is supported (for example
# Jupyter/IPython or `python -m asyncio`); in a plain script it is a SyntaxError.
sample_input_4 = [
HumanMessage(content="Can you provide me with details about Einstein's theory of relativity?"),
AIMessage(content="Sure, let me retrieve the relevant information for you.", tool_calls=[
ToolCall(name="document_search", args={"query": "Einstein's theory of relativity"})
]),
ToolMessage(content="Found relevant documents: 1. Relativity: The Special and the General Theory, 2. General Theory of Relativity by A. Einstein."),
AIMessage(content="I found some documents on Einstein's theory of relativity. Which one would you like to know more about: 'Relativity: The Special and the General Theory' or 'General Theory of Relativity by A. Einstein'?"),
HumanMessage(content="Tell me about the 'General Theory of Relativity'."),
AIMessage(content="Got it! Let me fetch more details from 'General Theory of Relativity by A. Einstein'.", tool_calls=[
ToolCall(name="document_retrieve", args={"document": "General Theory of Relativity by A. Einstein"})
]),
ToolMessage(content="The document discusses how gravity affects the fabric of spacetime, describing the relationship between mass and spacetime curvature."),
AIMessage(content="The 'General Theory of Relativity' explains how gravity affects the fabric of spacetime and the relationship between mass and spacetime curvature. Would you like more details or a specific explanation?"),
# From here on the user changes the subject away from physics.
HumanMessage(content="No, that's perfect. By the way, do you know any good recipes for a chocolate cake?"),
AIMessage(content="Sure! Let me find a simple and delicious recipe for a chocolate cake.", tool_calls=[
ToolCall(name="recipe_search", args={"query": "chocolate cake recipe"})
]),
ToolMessage(content="Here’s a popular recipe for a chocolate cake: Ingredients include flour, sugar, cocoa powder, eggs, milk, and butter. Instructions: Mix dry ingredients, add wet ingredients, and bake at 350°F for 30-35 minutes."),
AIMessage(content="I found a great recipe for chocolate cake! Would you like the full details, or is that summary enough?")
]


# Wrap the messages in a MultiTurnSample with "science" as the only reference topic.
sample = MultiTurnSample(user_input=sample_input_4, reference_topics=["science"])
scorer = TopicAdherenceScore(llm = evaluator_llm, mode="precision")
# The resulting score is not bound to a name; a notebook cell would echo it.
await scorer.multi_turn_ascore(sample)
```
Output
```
Expand All @@ -78,33 +126,52 @@ Output
`ToolCallAccuracy` is a metric that evaluates how well the LLM identifies and calls the tools required to complete a given task. It needs `user_input` and `reference_tool_calls`, and is computed by comparing the `reference_tool_calls` with the tool calls made by the AI. The values range between 0 and 1, with higher values indicating better performance.

```python

import asyncio

from ragas.dataset_schema import MultiTurnSample
from ragas.messages import AIMessage, HumanMessage, ToolCall, ToolMessage
from ragas.metrics import ToolCallAccuracy
from ragas.dataset_schema import MultiTurnSample
from ragas.messages import HumanMessage,AIMessage,ToolMessage,ToolCall

sample = [
HumanMessage(content="What's the weather like in New York right now?"),
AIMessage(content="The current temperature in New York is 75°F and it's partly cloudy.", tool_calls=[
ToolCall(name="weather_check", args={"location": "New York"})
]),
HumanMessage(content="Can you translate that to Celsius?"),
AIMessage(content="Let me convert that to Celsius for you.", tool_calls=[
ToolCall(name="temperature_conversion", args={"temperature_fahrenheit": 75})
]),
ToolMessage(content="75°F is approximately 23.9°C."),
AIMessage(content="75°F is approximately 23.9°C.")
]

sample = MultiTurnSample(
user_input=sample,
reference_tool_calls=[
ToolCall(name="weather_check", args={"location": "New York"}),
ToolCall(name="temperature_conversion", args={"temperature_fahrenheit": 75})

async def evaluate_tool_call_accuracy():
# Builds a short weather conversation, scores it with ToolCallAccuracy
# against the reference tool calls, and prints the score.
sample = [
HumanMessage(content="What's the weather like in New York right now?"),
AIMessage(
content="The current temperature in New York is 75°F and it's partly cloudy.",
tool_calls=[ToolCall(name="weather_check", args={"location": "New York"})],
),
HumanMessage(content="Can you translate that to Celsius?"),
AIMessage(
content="Let me convert that to Celsius for you.",
tool_calls=[
ToolCall(
name="temperature_conversion", args={"temperature_fahrenheit": 75}
)
],
),
ToolMessage(content="75°F is approximately 23.9°C."),
AIMessage(content="75°F is approximately 23.9°C."),
]
# NOTE(review): the next three statements (a stray `)`, `scorer = ...` and a
# bare `await ...`) repeat the ending of the older flat-script example and
# look like leftover removed lines from the diff view - confirm before
# copying this example.
)

scorer = ToolCallAccuracy()
await scorer.multi_turn_ascore(sample)
# `sample` is rebound here: the message list built above becomes the
# `user_input` of the MultiTurnSample.
sample = MultiTurnSample(
user_input=sample,
# Reference tool calls; names and args match the calls made in the
# AI messages above.
reference_tool_calls=[
ToolCall(name="weather_check", args={"location": "New York"}),
ToolCall(
name="temperature_conversion", args={"temperature_fahrenheit": 75}
),
],
)

scorer = ToolCallAccuracy()
score = await scorer.multi_turn_ascore(sample)
print(score)


# Run the coroutine only when the file is executed as a script.
if __name__ == "__main__":
asyncio.run(evaluate_tool_call_accuracy())

```
Output
```
Expand Down Expand Up @@ -134,31 +201,64 @@ Calculating `AgentGoalAccuracyWithReference` with reference needs `user_input` a


```python
from ragas.dataset_schema import MultiTurnSample
from ragas.messages import HumanMessage,AIMessage,ToolMessage,ToolCall
from ragas.metrics import AgentGoalAccuracyWithReference

import asyncio

# Notebook-style variant of the goal-accuracy-with-reference example. The bare
# `await` on the last line only works where top-level await is supported (for
# example Jupyter/IPython); `evaluator_llm` is not defined in this snippet and
# must already exist in the session.
sample = MultiTurnSample(user_input=[
HumanMessage(content="Hey, book a table at the nearest best Chinese restaurant for 8:00pm"),
AIMessage(content="Sure, let me find the best options for you.", tool_calls=[
ToolCall(name="restaurant_search", args={"cuisine": "Chinese", "time": "8:00pm"})
]),
ToolMessage(content="Found a few options: 1. Golden Dragon, 2. Jade Palace"),
AIMessage(content="I found some great options: Golden Dragon and Jade Palace. Which one would you prefer?"),
HumanMessage(content="Let's go with Golden Dragon."),
AIMessage(content="Great choice! I'll book a table for 8:00pm at Golden Dragon.", tool_calls=[
ToolCall(name="restaurant_book", args={"name": "Golden Dragon", "time": "8:00pm"})
]),
ToolMessage(content="Table booked at Golden Dragon for 8:00pm."),
AIMessage(content="Your table at Golden Dragon is booked for 8:00pm. Enjoy your meal!"),
HumanMessage(content="thanks"),
],
# Reference outcome passed to the metric alongside the conversation.
reference="Table booked at one of the chinese restaurants at 8 pm")

scorer = AgentGoalAccuracyWithReference(llm = evaluator_llm)
# The resulting score is not bound to a name; a notebook cell would echo it.
await scorer.multi_turn_ascore(sample)
from langchain_openai import ChatOpenAI
from ragas.dataset_schema import MultiTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.messages import AIMessage, HumanMessage, ToolCall, ToolMessage
from ragas.metrics import AgentGoalAccuracyWithReference

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))


async def evaluate_agent_goal_accuracy_with_reference():
    """Score a restaurant-booking conversation against a reference outcome.

    Builds the multi-turn sample, evaluates it with
    `AgentGoalAccuracyWithReference` using the module-level `evaluator_llm`,
    and prints the resulting score.
    """
    # Tool calls made by the assistant during the conversation.
    search_call = ToolCall(
        name="restaurant_search",
        args={"cuisine": "Chinese", "time": "8:00pm"},
    )
    booking_call = ToolCall(
        name="restaurant_book",
        args={"name": "Golden Dragon", "time": "8:00pm"},
    )

    conversation = [
        HumanMessage(
            content="Hey, book a table at the nearest best Chinese restaurant for 8:00pm"
        ),
        AIMessage(
            content="Sure, let me find the best options for you.",
            tool_calls=[search_call],
        ),
        ToolMessage(content="Found a few options: 1. Golden Dragon, 2. Jade Palace"),
        AIMessage(
            content="I found some great options: Golden Dragon and Jade Palace. Which one would you prefer?"
        ),
        HumanMessage(content="Let's go with Golden Dragon."),
        AIMessage(
            content="Great choice! I'll book a table for 8:00pm at Golden Dragon.",
            tool_calls=[booking_call],
        ),
        ToolMessage(content="Table booked at Golden Dragon for 8:00pm."),
        AIMessage(
            content="Your table at Golden Dragon is booked for 8:00pm. Enjoy your meal!"
        ),
        HumanMessage(content="thanks"),
    ]

    booking_sample = MultiTurnSample(
        user_input=conversation,
        reference="Table booked at one of the chinese restaurants at 8 pm",
    )

    metric = AgentGoalAccuracyWithReference(llm=evaluator_llm)
    score = await metric.multi_turn_ascore(booking_sample)
    print(score)


if __name__ == "__main__":
    asyncio.run(evaluate_agent_goal_accuracy_with_reference())
```
Output
```
Expand All @@ -173,29 +273,64 @@ Output
### Example

```python
from ragas.dataset_schema import MultiTurnSample
from ragas.messages import HumanMessage,AIMessage,ToolMessage,ToolCall
from ragas.metrics import AgentGoalAccuracyWithoutReference

import asyncio

from langchain_openai import ChatOpenAI
from ragas.dataset_schema import MultiTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.messages import AIMessage, HumanMessage, ToolCall, ToolMessage
from ragas.metrics import AgentGoalAccuracyWithoutReference

# Notebook-style variant of the goal-accuracy-without-reference example: no
# `reference` is passed to the sample. The bare `await` on the last line only
# works where top-level await is supported (for example Jupyter/IPython).
sample = MultiTurnSample(user_input=[
HumanMessage(content="Hey, book a table at the nearest best Chinese restaurant for 8:00pm"),
AIMessage(content="Sure, let me find the best options for you.", tool_calls=[
ToolCall(name="restaurant_search", args={"cuisine": "Chinese", "time": "8:00pm"})
]),
ToolMessage(content="Found a few options: 1. Golden Dragon, 2. Jade Palace"),
AIMessage(content="I found some great options: Golden Dragon and Jade Palace. Which one would you prefer?"),
HumanMessage(content="Let's go with Golden Dragon."),
AIMessage(content="Great choice! I'll book a table for 8:00pm at Golden Dragon.", tool_calls=[
ToolCall(name="restaurant_book", args={"name": "Golden Dragon", "time": "8:00pm"})
]),
ToolMessage(content="Table booked at Golden Dragon for 8:00pm."),
AIMessage(content="Your table at Golden Dragon is booked for 8:00pm. Enjoy your meal!"),
HumanMessage(content="thanks"),
])

scorer = AgentGoalAccuracyWithoutReference(llm = evaluator_llm)
# The resulting score is not bound to a name; a notebook cell would echo it.
await scorer.multi_turn_ascore(sample)
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))


async def evaluate_agent_goal_accuracy_without_reference():
    """Score a restaurant-booking conversation without a reference outcome.

    Builds the multi-turn sample (no `reference` is supplied), evaluates it
    with `AgentGoalAccuracyWithoutReference` using the module-level
    `evaluator_llm`, and prints the resulting score.
    """
    messages = []

    # Turn 1: the user asks for a booking and the assistant searches.
    messages.append(
        HumanMessage(
            content="Hey, book a table at the nearest best Chinese restaurant for 8:00pm"
        )
    )
    messages.append(
        AIMessage(
            content="Sure, let me find the best options for you.",
            tool_calls=[
                ToolCall(
                    name="restaurant_search",
                    args={"cuisine": "Chinese", "time": "8:00pm"},
                )
            ],
        )
    )
    messages.append(
        ToolMessage(content="Found a few options: 1. Golden Dragon, 2. Jade Palace")
    )
    messages.append(
        AIMessage(
            content="I found some great options: Golden Dragon and Jade Palace. Which one would you prefer?"
        )
    )

    # Turn 2: the user picks a restaurant and the assistant books it.
    messages.append(HumanMessage(content="Let's go with Golden Dragon."))
    messages.append(
        AIMessage(
            content="Great choice! I'll book a table for 8:00pm at Golden Dragon.",
            tool_calls=[
                ToolCall(
                    name="restaurant_book",
                    args={"name": "Golden Dragon", "time": "8:00pm"},
                )
            ],
        )
    )
    messages.append(ToolMessage(content="Table booked at Golden Dragon for 8:00pm."))
    messages.append(
        AIMessage(
            content="Your table at Golden Dragon is booked for 8:00pm. Enjoy your meal!"
        )
    )
    messages.append(HumanMessage(content="thanks"))

    booking_sample = MultiTurnSample(user_input=messages)

    metric = AgentGoalAccuracyWithoutReference(llm=evaluator_llm)
    score = await metric.multi_turn_ascore(booking_sample)
    print(score)


if __name__ == "__main__":
    asyncio.run(evaluate_agent_goal_accuracy_without_reference())

```
Output
Expand Down