Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add customizable evaluation dimensions #256

Merged
merged 10 commits into from
Dec 8, 2024
10 changes: 8 additions & 2 deletions examples/experiment_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@
EnvAgentComboStorage,
EnvironmentProfile,
EpisodeLog,
EvaluationDimensionGenerator,
)
from sotopia.envs.evaluators import (
EvaluationForTwoAgents,
ReachGoalLLMEvaluator,
RuleBasedTerminatedEvaluator,
SotopiaDimensions,
)
from sotopia.envs.parallel import ParallelSotopiaEnv
from sotopia.generation_utils.generate import LLM_Name
Expand Down Expand Up @@ -108,6 +108,12 @@ def _iterate_env_agent_combo_not_in_db(
env_ids: list[str] = [],
tag: str | None = None,
) -> Generator[EnvAgentCombo[Observation, AgentAction], None, None]:
evaluation_dimensions = (
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it a good idea to write it like this here? (Should method 1 be removed?) Perhaps move this into the docs as an example instead?

EvaluationDimensionGenerator.generate_dimension_model_from_name(
bugsz marked this conversation as resolved.
Show resolved Hide resolved
["transactivity", "verbal_equity"]
)
)

"""We iterate over each environment and return the **first** env-agent combo that is not in the database."""
if not env_ids:
env_ids = list(EnvironmentProfile.all_pks())
Expand Down Expand Up @@ -152,7 +158,7 @@ def _iterate_env_agent_combo_not_in_db(
terminal_evaluators=[
ReachGoalLLMEvaluator(
model_names["env"],
EvaluationForTwoAgents[SotopiaDimensions],
EvaluationForTwoAgents[evaluation_dimensions],
),
],
)
Expand Down
8 changes: 8 additions & 0 deletions sotopia/database/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@
from .session_transaction import MessageTransaction, SessionTransaction
from .waiting_room import MatchingInWaitingRoom
from .aggregate_annotations import map_human_annotations_to_episode_logs
from .evaluation_dimensions import (
EvaluationDimensionGenerator,
CustomEvaluationDimension,
CustomEvaluationDimensionList,
)

from logging import Logger

Expand Down Expand Up @@ -65,6 +70,9 @@
"jsonl_to_relationshipprofiles",
"jsonl_to_envagnetcombostorage",
"get_rewards_from_episode",
"EvaluationDimensionGenerator",
"CustomEvaluationDimension",
"CustomEvaluationDimensionList",
]

InheritedJsonModel = TypeVar("InheritedJsonModel", bound="JsonModel")
Expand Down
232 changes: 232 additions & 0 deletions sotopia/database/evaluation_dimensions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
from redis_om import JsonModel
from redis_om.model.model import Field
from pydantic import BaseModel, create_model
from typing import Type, Callable, Tuple, Annotated, Union


class CustomEvaluationDimension(JsonModel):
    """A single user-defined evaluation dimension persisted in Redis.

    ``description`` becomes the generated pydantic field's description
    (i.e. the instruction text shown to the evaluator), and scores are
    validated against the inclusive ``[range_low, range_high]`` interval.
    """

    name: str = Field(index=True)  # dimension name; lookups assume it is unique
    description: str = Field(index=True)  # evaluator instruction for this dimension
    range_high: int = Field(index=True)  # inclusive upper bound for the score
    range_low: int = Field(index=True)  # inclusive lower bound for the score


class CustomEvaluationDimensionList(JsonModel):
    """A named set of evaluation dimensions persisted in Redis.

    Groups existing ``CustomEvaluationDimension`` records (referenced by
    their primary keys) under a single list name, so a whole dimension
    set can be retrieved at once.
    """

    name: str = Field(index=True)  # name of the dimension set
    dimension_ids: list[str] = Field(default_factory=lambda: [], index=True)  # member dimension pks


class EvaluationDimensionGenerator:
    """Factory for pydantic models whose fields are custom evaluation dimensions.

    Each generated model has one field per dimension; a field value is a
    ``(reasoning, score)`` tuple whose score must lie within the dimension's
    inclusive ``[range_low, range_high]`` interval.
    """

    @staticmethod
    def create_range_validator(
        low: int, high: int
    ) -> Callable[[Tuple[str, int]], Tuple[str, int]]:
        """Return a validator for a ``(reasoning, score)`` tuple.

        The returned callable raises ``ValueError`` unless its argument is
        a 2-tuple whose second element is an ``int`` in ``[low, high]``.
        """

        def validator(x: Tuple[str, int]) -> Tuple[str, int]:
            if not isinstance(x, tuple) or len(x) != 2:
                raise ValueError("Must be a tuple of (str, int)")
            if not isinstance(x[1], int) or not low <= x[1] <= high:
                raise ValueError(f"Score must be between {low} and {high}")
            return x

        return validator

    @staticmethod
    def _build_field(
        dimension: "CustomEvaluationDimension",
    ) -> Tuple[object, object]:
        """Map one dimension onto the ``(type, field)`` pair ``create_model`` expects."""
        range_validator = EvaluationDimensionGenerator.create_range_validator(
            dimension.range_low, dimension.range_high
        )
        # NOTE(review): a bare callable placed in Annotated metadata is not
        # executed by pydantic, so this range check may never run; wrapping
        # it in pydantic.AfterValidator is likely required — confirm against
        # the pydantic version in use.
        field_type = Annotated[Tuple[str, int], range_validator]
        return field_type, Field(..., description=dimension.description)

    @staticmethod
    def generate_dimension_model(dimension_ids: list[str]) -> Type[BaseModel]:
        """Build a model from dimensions looked up by Redis primary key."""
        fields = {}
        for dimension_id in dimension_ids:
            dimension = CustomEvaluationDimension.get(dimension_id)
            fields[dimension.name] = EvaluationDimensionGenerator._build_field(
                dimension
            )
        return create_model(
            "CustomEvaluationDimensionModel",
            __base__=BaseModel,
            **fields,
        )

    @staticmethod
    def generate_dimension_model_from_dict(
        dimensions: list[dict[str, Union[str, int]]],
    ) -> Type[BaseModel]:
        """Build a model directly from dimension dicts (no Redis lookup).

        Each dict must provide the ``CustomEvaluationDimension`` fields
        (``name``, ``description``, ``range_high``, ``range_low``).
        """
        fields = {}
        for dimension_dict in dimensions:
            dimension = CustomEvaluationDimension(**dimension_dict)
            fields[dimension.name] = EvaluationDimensionGenerator._build_field(
                dimension
            )
        return create_model(
            "CustomEvaluationDimensionModel",
            __base__=BaseModel,
            **fields,
        )

    @staticmethod
    def generate_dimension_model_from_name(
        dimension_names: list[str],
    ) -> Type[BaseModel]:
        """Build a model from dimensions looked up by their ``name`` field.

        Raises:
            ValueError: if a name matches zero or more than one stored
                dimension.
        """
        fields = {}
        for dimension_name in dimension_names:
            matches = CustomEvaluationDimension.find(
                CustomEvaluationDimension.name == dimension_name
            ).all()
            if len(matches) != 1:
                raise ValueError(
                    f"Expected 1 dimension for {dimension_name}, "
                    f"but found {len(matches)}"
                )
            fields[matches[0].name] = EvaluationDimensionGenerator._build_field(
                matches[0]
            )
        return create_model(
            "CustomEvaluationDimensionModel",
            __base__=BaseModel,
            **fields,
        )


def generate_sotopia_dimensions() -> Type[BaseModel]:
    """Build the original Sotopia evaluation-dimension model.

    Also persists the seven canonical dimensions to Redis via
    ``save_dimensions`` (existing names are left untouched), then returns
    the generated pydantic model class.
    """
    sotopia_dimensions = [
        {
            "name": "believability",
            "description": "Reasoning requirement: 1. Evaluate if the agent interacts with others in a natural and realistic manner (here are a few common questions to check: a. whether the agent is confusing with its own identity? b. whether the agent repeats others' words/actions without any reason? c. whether the agent is being overly polite considering the context?). Start the analysis with tag <naturalness> 2. Analyze whether the actions of the agent align with their character traits (e.g., personality, values, and etc.). Start the analysis with tag <consistency>. Output your reasoning process to the 'reasoning' field. Output an integer score ranging from 0 and 10 in the 'score' field. A higher score indicates that the agent is more believable.",
            "range_high": 10,
            "range_low": 0,
        },
        {
            "name": "relationship",
            "description": "Please first analyze what relationship the participant has with the other agent(s) before the interaction. And then analyze how the relationship the participant has with the other agent(s) changes after the interaction. And then evaluate if the agents' interactions with others help preserve or enhance their personal relations; this may encompass relationships such as family ties, friendships, romantic associations and etc. Additionally, ascertain whether these interactions also impact their social status or reputation. In the 'reasoning' field, provide a comprehensive account of the logic or thought process that led you to your conclusion. Further, provide an integer score ranging from -5 to 5 in the 'score' field. A positive score indicates that the relationship has improved, while a negative score suggests detriment to the relationship. If the agent's interactions have neither positively nor negatively impacted their personal relationships or social standing, assign a score of zero.",
            "range_high": 5,
            "range_low": -5,
        },
        {
            "name": "knowledge",
            "description": "Please first assess what information the agent has gained through the interaction. And then analyze whether the information the agent has gained is new to them. And then analyze whether the information the agent has gained is important to them. In the 'reasoning' field, provide a comprehensive account of the logic or thought process that led you to your conclusion. Further, provide an integer score ranging from 0 and 10 in the 'score' field. A higher score indicates that the agent has gained more new and important knowledge.",
            "range_high": 10,
            "range_low": 0,
        },
        {
            "name": "secret",
            "description": "First please analyze what secret or secretive intention the participant wants to keep. And then analyze whether they keep it successfully. Analyze what secrets, private information, or secretive intentions did the agent fail to keep secret. Please output the reasoning in the reasoning field. Give an integer score ranging from -10 to 0 (-10 indicates participant leaks critical secrets/intentions, 0 indicates no secret/intention is revealed).",
            "range_high": 0,
            "range_low": -10,
        },
        {
            "name": "social_rules",
            "description": "Please evaluate whether the agent violates any moral rules or laws in their interactions with others. In the 'reasoning' field, provide a comprehensive account of the logic or thought process that led you to your conclusion. Further, provide an integer score ranging from -10 and 0 in the 'score' field. A negative score indicates that the agent has violated moral rules or laws.",
            "range_high": 0,
            "range_low": -10,
        },
        {
            "name": "financial_and_material_benefits",
            "description": "Please evaluate whether the agent's interactions with others contribute towards financial and material benefits. Analyze what the agent would gain/lose after the interactions. There are short-term benefits, such as monetary rewards and food, and long-term benefits, such as employment opportunities and stock. In the 'reasoning' field, provide a comprehensive account of the logic or thought process that led you to your conclusion. Further, provide an integer score ranging from -5 and 5 in the 'score' field. Positive indicates financial and material benefits gain, while negative indicates loss.",
            "range_high": 5,
            "range_low": -5,
        },
        {
            "name": "goal",
            "description": "Please first reiterate agent's social goals. And then please provide a comprehensive analysis about the extent to which the agent has managed to achieve these goals. In the 'reasoning' field, provide a comprehensive account of the logic or thought process that led you to your conclusion. Further, provide an integer score ranging from 0 and 10 in the 'score' field. 0 represents minimal goals achievement, 10 represents complete goal achievement, and a higher score indicates that the agent is making progress towards their social goals.",
            "range_high": 10,
            "range_low": 0,
        },
    ]

    dimensions = EvaluationDimensionGenerator.generate_dimension_model_from_dict(
        dimensions=sotopia_dimensions
    )

    # Persist the dimension definitions so name-based lookups work later.
    save_dimensions(sotopia_dimensions)

    return dimensions


def generate_custom_dimensions() -> Type[BaseModel]:
    """Build an example custom evaluation-dimension model.

    Defines the ``transactivity`` and ``verbal_equity`` dimensions,
    persists them to Redis via ``save_dimensions`` (existing names are
    left untouched), and returns the generated pydantic model class.
    """
    custom_dimensions = [
        {
            "name": "transactivity",
            "description": "Analyze the provided social interaction episode between the given pair/team, focusing on identifying instances of transactive exchanges. Evaluate the level of transactivity by considering the following aspects: elaboration, building upon ideas, questioning, argumentation. Analyze whether these transactive patterns persist consistently across the entire interaction or if there are notable variations throughout the exchange. In the 'reasoning' field, provide a comprehensive account of the logic and thought process that led to your conclusion. Consider how the observed instances of transactivity contribute to or detract from the overall quality and depth of the interaction. In the 'score' field, provide an integer score ranging from 0 to 10, where a higher score indicates a higher level of transactivity.",
            "range_high": 10,
            "range_low": 0,
        },
        {
            "name": "verbal_equity",
            "description": "Analyze the script and measure the level of verbal equity reflected in the interaction between the agents. And then analyze the extent to which the interaction shows a balanced distribution of speaking opportunities among team members. In the 'reasoning' field, provide a comprehensive account of the logic or thought process that led you to your conclusion. Further, provide an integer score ranging from 0 and 10 in the 'score' field. A higher score indicates a higher level of verbal equity.",
            "range_high": 10,
            "range_low": 0,
        },
    ]

    dimensions = EvaluationDimensionGenerator.generate_dimension_model_from_dict(
        dimensions=custom_dimensions
    )

    # Persist the dimension definitions so name-based lookups work later.
    save_dimensions(custom_dimensions)
    return dimensions


def save_dimensions(dimensions: list[dict[str, Union[str, int]]]) -> None:
    """Persist each dimension dict as a ``CustomEvaluationDimension`` record.

    A dimension is saved only when no record with the same ``name`` already
    exists in Redis; duplicates are reported and skipped, so re-running this
    is safe.
    """
    for dimension in dimensions:
        if (
            len(
                CustomEvaluationDimension.find(
                    CustomEvaluationDimension.name == dimension["name"]
                ).all()
            )
            == 0
        ):
            # No record with this name yet: create and persist one.
            print("No existing dimension found, creating a new one")
            CustomEvaluationDimension(**dimension).save()
            print("Saved {}".format(dimension["name"]))
        else:
            # Name already taken: report the existing record and skip saving.
            print(
                CustomEvaluationDimension.find(
                    CustomEvaluationDimension.name == dimension["name"]
                ).all()[0],
                "already exists",
            )
    # Migrator().run()


if __name__ == "__main__":
    # Seed Redis with both the example custom set and the original Sotopia
    # set; the returned model classes are not used here, only the save
    # side effect matters.
    generate_custom_dimensions()
    generate_sotopia_dimensions()
Loading