Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add customizable evaluation dimensions #256

Merged
merged 10 commits into from
Dec 8, 2024
10 changes: 8 additions & 2 deletions examples/experiment_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@
EnvAgentComboStorage,
EnvironmentProfile,
EpisodeLog,
EvaluationDimensionGenerator,
)
from sotopia.envs.evaluators import (
EvaluationForTwoAgents,
ReachGoalLLMEvaluator,
RuleBasedTerminatedEvaluator,
SotopiaDimensions,
)
from sotopia.envs.parallel import ParallelSotopiaEnv
from sotopia.generation_utils.generate import LLM_Name
Expand Down Expand Up @@ -108,6 +108,12 @@ def _iterate_env_agent_combo_not_in_db(
env_ids: list[str] = [],
tag: str | None = None,
) -> Generator[EnvAgentCombo[Observation, AgentAction], None, None]:
evaluation_dimensions = (
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it a good idea to write it like this here? (Should method 1 be removed?) Perhaps move this into the docs as an example instead?

EvaluationDimensionGenerator.generate_dimension_model_from_name(
bugsz marked this conversation as resolved.
Show resolved Hide resolved
["transactivity", "verbal_equity"]
)
)

"""We iterate over each environment and return the **first** env-agent combo that is not in the database."""
if not env_ids:
env_ids = list(EnvironmentProfile.all_pks())
Expand Down Expand Up @@ -152,7 +158,7 @@ def _iterate_env_agent_combo_not_in_db(
terminal_evaluators=[
ReachGoalLLMEvaluator(
model_names["env"],
EvaluationForTwoAgents[SotopiaDimensions],
EvaluationForTwoAgents[evaluation_dimensions],
),
],
)
Expand Down
8 changes: 8 additions & 0 deletions sotopia/database/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@
from .session_transaction import MessageTransaction, SessionTransaction
from .waiting_room import MatchingInWaitingRoom
from .aggregate_annotations import map_human_annotations_to_episode_logs
from .evaluation_dimensions import (
EvaluationDimensionGenerator,
CustomEvaluationDimension,
CustomEvaluationDimensionList,
)

from logging import Logger

Expand Down Expand Up @@ -65,6 +70,9 @@
"jsonl_to_relationshipprofiles",
"jsonl_to_envagnetcombostorage",
"get_rewards_from_episode",
"EvaluationDimensionGenerator",
"CustomEvaluationDimension",
"CustomEvaluationDimensionList",
]

InheritedJsonModel = TypeVar("InheritedJsonModel", bound="JsonModel")
Expand Down
232 changes: 232 additions & 0 deletions sotopia/database/evaluation_dimensions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
from redis_om import JsonModel
from redis_om.model.model import Field
from pydantic import BaseModel, create_model
from typing import Type, Callable, Tuple, Annotated, Union


class CustomEvaluationDimension(JsonModel):
    """A single user-defined evaluation dimension persisted in Redis.

    ``description`` becomes the generated pydantic field's description
    (i.e. the instruction text shown to the evaluator), and scores are
    validated against the inclusive ``[range_low, range_high]`` interval.
    """

    name: str = Field(index=True)  # dimension name; lookups assume it is unique
    description: str = Field(index=True)  # evaluator instruction for this dimension
    range_high: int = Field(index=True)  # inclusive upper bound for the score
    range_low: int = Field(index=True)  # inclusive lower bound for the score


class CustomEvaluationDimensionList(JsonModel):
    """A named set of evaluation dimensions persisted in Redis.

    Groups existing ``CustomEvaluationDimension`` records (referenced by
    their primary keys) under a single list name, so a whole dimension
    set can be retrieved at once.
    """

    name: str = Field(index=True)  # name of the dimension set
    dimension_ids: list[str] = Field(default_factory=lambda: [], index=True)  # member dimension pks


class EvaluationDimensionGenerator:
    """Factory for pydantic models whose fields are custom evaluation dimensions.

    Each generated model has one field per dimension; a field value is a
    ``(reasoning, score)`` tuple whose score must lie within the dimension's
    inclusive ``[range_low, range_high]`` interval.
    """

    @staticmethod
    def create_range_validator(
        low: int, high: int
    ) -> Callable[[Tuple[str, int]], Tuple[str, int]]:
        """Return a validator for a ``(reasoning, score)`` tuple.

        The returned callable raises ``ValueError`` unless its argument is
        a 2-tuple whose second element is an ``int`` in ``[low, high]``.
        """

        def validator(x: Tuple[str, int]) -> Tuple[str, int]:
            if not isinstance(x, tuple) or len(x) != 2:
                raise ValueError("Must be a tuple of (str, int)")
            if not isinstance(x[1], int) or not low <= x[1] <= high:
                raise ValueError(f"Score must be between {low} and {high}")
            return x

        return validator

    @staticmethod
    def _build_field(
        dimension: "CustomEvaluationDimension",
    ) -> Tuple[object, object]:
        """Map one dimension onto the ``(type, field)`` pair ``create_model`` expects."""
        range_validator = EvaluationDimensionGenerator.create_range_validator(
            dimension.range_low, dimension.range_high
        )
        # NOTE(review): a bare callable placed in Annotated metadata is not
        # executed by pydantic, so this range check may never run; wrapping
        # it in pydantic.AfterValidator is likely required — confirm against
        # the pydantic version in use.
        field_type = Annotated[Tuple[str, int], range_validator]
        return field_type, Field(..., description=dimension.description)

    @staticmethod
    def generate_dimension_model(dimension_ids: list[str]) -> Type[BaseModel]:
        """Build a model from dimensions looked up by Redis primary key."""
        fields = {}
        for dimension_id in dimension_ids:
            dimension = CustomEvaluationDimension.get(dimension_id)
            fields[dimension.name] = EvaluationDimensionGenerator._build_field(
                dimension
            )
        return create_model(
            "CustomEvaluationDimensionModel",
            __base__=BaseModel,
            **fields,
        )

    @staticmethod
    def generate_dimension_model_from_dict(
        dimensions: list[dict[str, Union[str, int]]],
    ) -> Type[BaseModel]:
        """Build a model directly from dimension dicts (no Redis lookup).

        Each dict must provide the ``CustomEvaluationDimension`` fields
        (``name``, ``description``, ``range_high``, ``range_low``).
        """
        fields = {}
        for dimension_dict in dimensions:
            dimension = CustomEvaluationDimension(**dimension_dict)
            fields[dimension.name] = EvaluationDimensionGenerator._build_field(
                dimension
            )
        return create_model(
            "CustomEvaluationDimensionModel",
            __base__=BaseModel,
            **fields,
        )

    @staticmethod
    def generate_dimension_model_from_name(
        dimension_names: list[str],
    ) -> Type[BaseModel]:
        """Build a model from dimensions looked up by their ``name`` field.

        Raises:
            ValueError: if a name matches zero or more than one stored
                dimension.
        """
        fields = {}
        for dimension_name in dimension_names:
            matches = CustomEvaluationDimension.find(
                CustomEvaluationDimension.name == dimension_name
            ).all()
            if len(matches) != 1:
                raise ValueError(
                    f"Expected 1 dimension for {dimension_name}, "
                    f"but found {len(matches)}"
                )
            fields[matches[0].name] = EvaluationDimensionGenerator._build_field(
                matches[0]
            )
        return create_model(
            "CustomEvaluationDimensionModel",
            __base__=BaseModel,
            **fields,
        )


def generate_sotopia_dimensions() -> Type[BaseModel]:
    """Build the original Sotopia evaluation-dimension model.

    Also persists the seven canonical dimensions to Redis via
    ``save_dimensions`` (existing names are left untouched), then returns
    the generated pydantic model class.
    """
    sotopia_dimensions = [
        {
            "name": "believability",
            "description": "Reasoning requirement: 1. Evaluate if the agent interacts with others in a natural and realistic manner (here are a few common questions to check: a. whether the agent is confusing with its own identity? b. whether the agent repeats others' words/actions without any reason? c. whether the agent is being overly polite considering the context?). Start the analysis with tag <naturalness> 2. Analyze whether the actions of the agent align with their character traits (e.g., personality, values, and etc.). Start the analysis with tag <consistency>. Output your reasoning process to the 'reasoning' field. Output an integer score ranging from 0 and 10 in the 'score' field. A higher score indicates that the agent is more believable.",
            "range_high": 10,
            "range_low": 0,
        },
        {
            "name": "relationship",
            "description": "Please first analyze what relationship the participant has with the other agent(s) before the interaction. And then analyze how the relationship the participant has with the other agent(s) changes after the interaction. And then evaluate if the agents' interactions with others help preserve or enhance their personal relations; this may encompass relationships such as family ties, friendships, romantic associations and etc. Additionally, ascertain whether these interactions also impact their social status or reputation. In the 'reasoning' field, provide a comprehensive account of the logic or thought process that led you to your conclusion. Further, provide an integer score ranging from -5 to 5 in the 'score' field. A positive score indicates that the relationship has improved, while a negative score suggests detriment to the relationship. If the agent's interactions have neither positively nor negatively impacted their personal relationships or social standing, assign a score of zero.",
            "range_high": 5,
            "range_low": -5,
        },
        {
            "name": "knowledge",
            "description": "Please first assess what information the agent has gained through the interaction. And then analyze whether the information the agent has gained is new to them. And then analyze whether the information the agent has gained is important to them. In the 'reasoning' field, provide a comprehensive account of the logic or thought process that led you to your conclusion. Further, provide an integer score ranging from 0 and 10 in the 'score' field. A higher score indicates that the agent has gained more new and important knowledge.",
            "range_high": 10,
            "range_low": 0,
        },
        {
            "name": "secret",
            "description": "First please analyze what secret or secretive intention the participant wants to keep. And then analyze whether they keep it successfully. Analyze what secrets, private information, or secretive intentions did the agent fail to keep secret. Please output the reasoning in the reasoning field. Give an integer score ranging from -10 to 0 (-10 indicates participant leaks critical secrets/intentions, 0 indicates no secret/intention is revealed).",
            "range_high": 0,
            "range_low": -10,
        },
        {
            "name": "social_rules",
            "description": "Please evaluate whether the agent violates any moral rules or laws in their interactions with others. In the 'reasoning' field, provide a comprehensive account of the logic or thought process that led you to your conclusion. Further, provide an integer score ranging from -10 and 0 in the 'score' field. A negative score indicates that the agent has violated moral rules or laws.",
            "range_high": 0,
            "range_low": -10,
        },
        {
            "name": "financial_and_material_benefits",
            "description": "Please evaluate whether the agent's interactions with others contribute towards financial and material benefits. Analyze what the agent would gain/lose after the interactions. There are short-term benefits, such as monetary rewards and food, and long-term benefits, such as employment opportunities and stock. In the 'reasoning' field, provide a comprehensive account of the logic or thought process that led you to your conclusion. Further, provide an integer score ranging from -5 and 5 in the 'score' field. Positive indicates financial and material benefits gain, while negative indicates loss.",
            "range_high": 5,
            "range_low": -5,
        },
        {
            "name": "goal",
            "description": "Please first reiterate agent's social goals. And then please provide a comprehensive analysis about the extent to which the agent has managed to achieve these goals. In the 'reasoning' field, provide a comprehensive account of the logic or thought process that led you to your conclusion. Further, provide an integer score ranging from 0 and 10 in the 'score' field. 0 represents minimal goals achievement, 10 represents complete goal achievement, and a higher score indicates that the agent is making progress towards their social goals.",
            "range_high": 10,
            "range_low": 0,
        },
    ]

    dimensions = EvaluationDimensionGenerator.generate_dimension_model_from_dict(
        dimensions=sotopia_dimensions
    )

    # Persist the dimension definitions so name-based lookups work later.
    save_dimensions(sotopia_dimensions)

    return dimensions


def generate_custom_dimensions() -> Type[BaseModel]:
    """Build an example custom evaluation-dimension model.

    Defines the ``transactivity`` and ``verbal_equity`` dimensions,
    persists them to Redis via ``save_dimensions`` (existing names are
    left untouched), and returns the generated pydantic model class.
    """
    custom_dimensions = [
        {
            "name": "transactivity",
            "description": "Analyze the provided social interaction episode between the given pair/team, focusing on identifying instances of transactive exchanges. Evaluate the level of transactivity by considering the following aspects: elaboration, building upon ideas, questioning, argumentation. Analyze whether these transactive patterns persist consistently across the entire interaction or if there are notable variations throughout the exchange. In the 'reasoning' field, provide a comprehensive account of the logic and thought process that led to your conclusion. Consider how the observed instances of transactivity contribute to or detract from the overall quality and depth of the interaction. In the 'score' field, provide an integer score ranging from 0 to 10, where a higher score indicates a higher level of transactivity.",
            "range_high": 10,
            "range_low": 0,
        },
        {
            "name": "verbal_equity",
            "description": "Analyze the script and measure the level of verbal equity reflected in the interaction between the agents. And then analyze the extent to which the interaction shows a balanced distribution of speaking opportunities among team members. In the 'reasoning' field, provide a comprehensive account of the logic or thought process that led you to your conclusion. Further, provide an integer score ranging from 0 and 10 in the 'score' field. A higher score indicates a higher level of verbal equity.",
            "range_high": 10,
            "range_low": 0,
        },
    ]

    dimensions = EvaluationDimensionGenerator.generate_dimension_model_from_dict(
        dimensions=custom_dimensions
    )

    # Persist the dimension definitions so name-based lookups work later.
    save_dimensions(custom_dimensions)
    return dimensions


def save_dimensions(dimensions: list[dict[str, Union[str, int]]]) -> None:
    """Persist each dimension dict as a ``CustomEvaluationDimension`` record.

    A dimension is saved only when no record with the same ``name`` already
    exists in Redis; duplicates are reported and skipped, so re-running this
    is safe.
    """
    for dimension in dimensions:
        if (
            len(
                CustomEvaluationDimension.find(
                    CustomEvaluationDimension.name == dimension["name"]
                ).all()
            )
            == 0
        ):
            # No record with this name yet: create and persist one.
            print("No existing dimension found, creating a new one")
            CustomEvaluationDimension(**dimension).save()
            print("Saved {}".format(dimension["name"]))
        else:
            # Name already taken: report the existing record and skip saving.
            print(
                CustomEvaluationDimension.find(
                    CustomEvaluationDimension.name == dimension["name"]
                ).all()[0],
                "already exists",
            )
    # Migrator().run()


if __name__ == "__main__":
    # Seed Redis with both the example custom set and the original Sotopia
    # set; the returned model classes are not used here, only the save
    # side effect matters.
    generate_custom_dimensions()
    generate_sotopia_dimensions()
Loading