Binary file added docs/sphinx_doc/assets/email_eval_accuracy.png
1 change: 1 addition & 0 deletions docs/sphinx_doc/source/index.rst
Original file line number Diff line number Diff line change
@@ -22,6 +22,7 @@ Welcome to Trinity-RFT's documentation!
tutorial/example_multi_turn.md
tutorial/example_step_wise.md
tutorial/example_react.md
tutorial/example_search_email.md
tutorial/example_dpo.md
tutorial/example_data_functionalities.md

52 changes: 52 additions & 0 deletions docs/sphinx_doc/source/tutorial/example_search_email.md
@@ -0,0 +1,52 @@
# Email Search Workflow


This example shows a multi-turn email search workflow, inspired by [ART](https://openpipe.ai/blog/art-e-mail-agent?refresh=1756431423904). We implement a ReAct agent and define tools for email search. Note that this example requires installing `AgentScope==0.1.6`.

## Core Components

We need to define some components:

- `EmailSearchWorkflow`: The main class that orchestrates the entire process. It initializes the environment, manages the agent, and runs the task.
- `EmailSearchAgent`: The "brain" of the operation.
* It receives the user's query and a system prompt.
* It decides which actions to take (e.g., which tool to use).
* It is built using the `AgentScope` framework.
- **Tools**: These are the functions the agent can call to interact with the environment. They include:
* `search_email`: To find relevant emails.
* `read_email`: To read the content of a specific email.
* `generate_response`: To provide the final answer when it is found. This tool can be inherited from the `AgentScope` framework.
- **Judge LLM**: The judge LLM is used to evaluate the agent's performance, defined by `auxiliary_models`.
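
To make the division of labor concrete, here is a minimal, self-contained sketch of the three tools and one agent turn. The in-memory email store and all names here are illustrative only; the actual implementation lives under `trinity/common/workflows/envs/email_searcher/` and uses AgentScope's tool-calling machinery rather than the hand-rolled loop below.

```python
# Toy in-memory email store standing in for the real database (illustrative).
EMAILS = {
    1: {"subject": "Q3 budget", "body": "The Q3 budget is $40k."},
    2: {"subject": "Lunch", "body": "Lunch at noon?"},
}

def search_email(keyword: str) -> list[int]:
    """Return ids of emails whose subject or body mentions the keyword."""
    return [
        eid for eid, e in EMAILS.items()
        if keyword.lower() in (e["subject"] + " " + e["body"]).lower()
    ]

def read_email(email_id: int) -> str:
    """Return the body of a specific email."""
    return EMAILS[email_id]["body"]

def generate_response(answer: str) -> str:
    """Terminal tool: the agent calls this once it has found the answer."""
    return answer

# One hand-rolled "agent" run, standing in for the LLM's tool choices:
ids = search_email("budget")      # the agent searches first
body = read_email(ids[0])         # then reads the top hit
final = generate_response(body)   # then emits the final answer
```

In the real workflow, the `EmailSearchAgent` decides at each turn which of these tools to invoke, up to `max_turns` times.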


## Run the Experiments

### Step 1: Prepare the Database

We prepare the data by running the following command:

```bash
python trinity/common/workflows/envs/email_searcher/prepare_data.py
```

If you want to use a different database path, you can modify `DEFAULT_DB_PATH` in `prepare_data.py`. Also remember to set the environment variable `DEFAULT_EMAIL_DB_PATH` to the database path before moving on to the next step.
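
For example (the path below is a placeholder; point it at wherever the preparation script wrote the database file):

```shell
# Placeholder path -- replace with the database location from Step 1.
export DEFAULT_EMAIL_DB_PATH=/data/enron/emails.db
```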


### Step 2: Run the Workflow

The config files are located in [`email_search.yaml`](https://github.com/modelscope/Trinity-RFT/tree/main/examples/grpo_email_search/email_search.yaml) and [`train_email_search.yaml`](https://github.com/modelscope/Trinity-RFT/tree/main/examples/grpo_email_search/train_email_search.yaml).
To run this example, use the following command:

```bash
trinity run --config examples/grpo_email_search/email_search.yaml
```
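
The reward in this example comes from an LLM judge (enabled via `llm_as_a_judge: true` and served by the `auxiliary_models` entry in the config). A sketch of how such a scoring function can work is below; the prompt wording and function names are illustrative, not Trinity-RFT's actual reward implementation, and the `judge` argument is any callable that returns a yes/no verdict.

```python
def judge_reward(question: str, reference: str, answer: str, judge) -> float:
    """Score an agent answer with an LLM judge returning 'yes'/'no'."""
    verdict = judge(
        f"Question: {question}\n"
        f"Reference answer: {reference}\n"
        f"Agent answer: {answer}\n"
        "Does the agent answer match the reference? Reply yes or no."
    )
    return 1.0 if verdict.strip().lower().startswith("yes") else 0.0

# Stub judge for demonstration: simple string containment instead of a model.
def stub(prompt: str) -> str:
    agent_answer = prompt.split("Agent answer:")[1].split("\n")[0]
    return "yes" if "40k" in agent_answer else "no"

score = judge_reward("What is the Q3 budget?", "$40k",
                     "The Q3 budget is $40k.", stub)
```

In the actual run the judge is the auxiliary Qwen3-30B-A3B-Instruct-2507 model, and its verdict is folded into the rollout reward.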


## Evaluation Results

The results are shown in the following figure (the accuracy ranges from -0.1 to 1.0):

![](../../assets/email_rollout_accuracy.png)


![](../../assets/email_eval_accuracy.png)
7 changes: 7 additions & 0 deletions examples/grpo_email_search/README.md
@@ -0,0 +1,7 @@
# Email Search Workflow

This example shows a multi-turn email search workflow, inspired by [ART](https://openpipe.ai/blog/art-e-mail-agent?refresh=1756431423904). We implement a ReAct agent and define tools for email search. Note that this example requires installing `AgentScope==0.1.6`.

For more detailed information, please refer to the [documentation](../../docs/sphinx_doc/source/tutorial/example_search_email.md).

The config files are located in [`email_search.yaml`](email_search.yaml) and [`train_email_search.yaml`](train_email_search.yaml).
95 changes: 95 additions & 0 deletions examples/grpo_email_search/email_search.yaml
@@ -0,0 +1,95 @@
project: "Trinity_Multi_Step"
name: "Email_Example"
checkpoint_root_dir: /PATH/TO/CHECKPOINT/
algorithm:
algorithm_type: grpo
repeat_times: 8
advantage_fn: grpo
model:
model_path: /PATH/TO/Qwen3-4B-Instruct-2507
max_response_tokens: 4096
max_model_len: 20480
cluster:
node_num: 1
gpu_per_node: 8
buffer:
total_epochs: 1
batch_size: 16
train_batch_size: 640 # 16*8*5
max_retry_times: 3
max_retry_interval: 1
explorer_input:
taskset:
name: enron_train
storage_type: file
path: '/PATH/TO/ENRON_EMAIL_QA_PAIRS/'
split: train
format:
prompt_key: 'question'
response_key: 'answer'
workflow_args:
max_turns: 10
reward_fn_args:
llm_as_a_judge: true
rollout_args:
temperature: 1.0
enable_progress_bar: false
eval_tasksets:
- name: enron_test
storage_type: file
path: '/PATH/TO/ENRON_EMAIL_QA_PAIRS/'
split: test
format:
prompt_key: 'question'
response_key: 'answer'
enable_progress_bar: false
workflow_args:
max_turns: 10
reward_fn_args:
llm_as_a_judge: true
rollout_args:
temperature: 0.6
# max_tokens: 4096
default_workflow_type: 'email_search_workflow'
trainer_input:
experience_buffer:
name: experience_buffer
storage_type: queue
use_priority_queue: true
explorer:
eval_interval: 10
max_repeat_times_per_runner: 1
max_timeout: 3600
rollout_model:
enable_thinking: true
enable_history: true
enable_openai_api: true
enable_auto_tool_choice: true
tool_call_parser: hermes
engine_num: 4
tensor_parallel_size: 1
enable_prefix_caching: false
enforce_eager: true
dtype: bfloat16
seed: 42
gpu_memory_utilization: 0.7
enable_chunked_prefill: true
auxiliary_models:
- model_path: /PATH/TO/Qwen3-30B-A3B-Instruct-2507
engine_num: 1
tensor_parallel_size: 2
enable_thinking: false
max_prompt_tokens: 2048
max_response_tokens: 128
max_model_len: 2500
synchronizer:
sync_style: dynamic_by_explorer
sync_method: 'nccl'
sync_interval: 5
sync_timeout: 3600
trainer:
trainer_type: 'verl'
trainer_config_path: 'examples/grpo_email_search/train_email_search.yaml'
save_interval: 100
monitor:
monitor_type: wandb
48 changes: 48 additions & 0 deletions examples/grpo_email_search/train_email_search.yaml
@@ -0,0 +1,48 @@
actor_rollout_ref:
hybrid_engine: True
model:
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
use_remove_padding: False
actor:
strategy: fsdp # This is for backward-compatibility
ppo_micro_batch_size_per_gpu: 1
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 16384
grad_clip: 1.0
ppo_epochs: 1
shuffle: False
ulysses_sequence_parallel_size: 1 # sp size
optim:
lr: 1e-6
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
# min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
total_training_steps: -1 # overridden by the program at runtime
fsdp_config:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
param_offload: False
optimizer_offload: False
fsdp_size: -1
ref:
fsdp_config:
param_offload: False
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
log_prob_micro_batch_size_per_gpu: 1
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
trainer:
balance_batch: True
# total_training_steps: null
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or disable, or resume_path
default_hdfs_dir: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
val_before_train: False
2 changes: 1 addition & 1 deletion trinity/algorithm/advantage_fn/grpo_advantage.py
@@ -93,7 +93,7 @@ def default_args(cls) -> Dict:

@ADVANTAGE_FN.register_module("grpo")
class GRPOGroupedAdvantage(GroupAdvantage):
"""An example AddStrategy that calculates GRPO advantages."""
"""An advantage class that calculates GRPO advantages."""

def __init__(
self,
2 changes: 2 additions & 0 deletions trinity/common/workflows/__init__.py
@@ -6,6 +6,7 @@
from .envs.alfworld.alfworld_workflow import AlfworldWorkflow, StepWiseAlfworldWorkflow
from .envs.alfworld.RAFT_alfworld_workflow import RAFTAlfworldWorkflow
from .envs.alfworld.RAFT_reflect_alfworld_workflow import RAFTReflectAlfworldWorkflow
from .envs.email_searcher.workflow import EmailSearchWorkflow
from .envs.sciworld.sciworld_workflow import SciWorldWorkflow
from .envs.webshop.webshop_workflow import WebShopWorkflow
from .eval_workflow import MathEvalWorkflow
@@ -29,4 +30,5 @@
"ToolCallWorkflow",
"MathEvalWorkflow",
"AgentScopeReactV2MathWorkflow",
"EmailSearchWorkflow",
]
Original file line number Diff line number Diff line change
@@ -149,10 +149,10 @@ def run(self):
for i, experience in enumerate(experiences):
experience.eid.step = i
experience.reward = reward
turns_metrics = {"agent_turns": len(self.agent.memory.get_memory())}
agent_metrics = {"react_memory_length": len(self.agent.memory.get_memory())}
if experience.metrics is None:
experience.metrics = {}
experience.metrics.update(turns_metrics)
experience.metrics.update(agent_metrics)
self.logger.debug(
f"return experience len: {len(experiences)}, run_id: {str(experiences[-1].eid.run)}, final step reward: {experiences[-1].reward}"
)